library(tidyverse)
library(tm)
library(tidytext)
library(dplyr)
library(wordcloud)
library(tidyr); library(ggplot2); library(ggthemes)
library(tidyverse)
library(grid)
library(gridExtra)
library(magrittr)
library(stringr)
disneyland<- read.csv(file = "disneyland.csv", stringsAsFactors = F)
rides <- read_csv('rides.csv')
library(grid)
library(gridExtra)
Ho: Reviews from different Disney branches tend to focus on similar
topics.
Ha: Reviews from different Disney branches focus on different
topics.
# split dataset based on branch
hongkong <- disneyland %>%
filter(Branch == 'Disneyland_HongKong')
cali <- disneyland %>%
filter(Branch == 'Disneyland_California')
paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris')
Splited the three branches based on location and examine the reviews in the following.
hongkong$Review_Text <- gsub('Hong Kong','HongKong', hongkong$Review_Text)
# create corpus
corpus = Corpus(VectorSource(hongkong$Review_Text))
corpus[[1]][1]
## $content
## [1] "If you've ever been to Disneyland anywhere you'll find Disneyland HongKong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides its a Small World is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "
# clean text
# convert to lower case
corpus = tm_map(corpus,FUN = content_transformer(tolower))
# remove urls
corpus = tm_map(corpus,
FUN = content_transformer(FUN = function(x)gsub(pattern = 'http[[:alnum:][:punct:]]*',
replacement = ' ',x = x)))
# remove punctuations
corpus = tm_map(corpus,FUN = removePunctuation)
corpus[[1]][1]
## $content
## [1] "if youve ever been to disneyland anywhere youll find disneyland hongkong very similar in the layout when you walk into main street it has a very familiar feel one of the rides its a small world is absolutely fabulous and worth doing the day we visited was fairly hot and relatively busy but the queues moved fairly well "
# remove stopwords
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
corpus[[1]][1]
## $content
## [1] " youve ever disneyland anywhere youll find disneyland hongkong similar layout walk main street familiar feel one rides small world absolutely fabulous worth day visited fairly hot relatively busy queues moved fairly well "
# strip white spaces
corpus = tm_map(corpus,FUN = stripWhitespace)
corpus[[1]][1]
## $content
## [1] " youve ever disneyland anywhere youll find disneyland hongkong similar layout walk main street familiar feel one rides small world absolutely fabulous worth day visited fairly hot relatively busy queues moved fairly well "
# create dictionary
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))
# stem document
corpus = tm_map(corpus,FUN = stemDocument)
# create document term matrix
dtm = DocumentTermMatrix(corpus)
# remove sparse terms
xdtm = removeSparseTerms(dtm,sparse = 0.97)
xdtm
## <<DocumentTermMatrix (documents: 9147, terms: 292)>>
## Non-/sparse entries: 234093/2436831
## Sparsity : 91%
## Maximal term length: 10
## Weighting : term frequency (tf)
dim(xdtm)
## [1] 9147 292
# make dataframe and complete stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
word_freq_hk <- data.frame(word = colnames(xdtm), freq = colSums(xdtm))
# word cloud
# set.seed(617)
# wordcloud(words = word_freq_hk$word,word_freq_hk$freq,scale=c(2,0.5),max.words = 100,colors=brewer.pal(9,"Spectral"))
word_total_hk = sum(word_freq_hk$freq)
hk_30 <- word_freq_hk %>%
mutate(percentage = freq/word_total_hk*100) %>%
arrange(desc(percentage)) %>%
head(30)
# top 30 frequent words in Hong Kong
hk_30
## word freq percentage
## ride ride 9138 3.0537055
## park park 8088 2.7028201
## disneyland disneyland 7616 2.5450888
## day day 7045 2.3542740
## disney disney 6953 2.3235297
## time time 5279 1.7641181
## visit visit 4536 1.5158249
## one one 3914 1.3079671
## kid kid 3877 1.2956026
## show show 3413 1.1405446
## get get 3408 1.1388738
## place place 3231 1.0797245
## hongkong hongkong 3070 1.0259221
## food food 2999 1.0021955
## can can 2894 0.9671070
## enjoy enjoy 2889 0.9654361
## parad parad 2889 0.9654361
## great great 2878 0.9617602
## good good 2748 0.9183172
## will will 2473 0.8264187
## just just 2447 0.8177301
## like like 2435 0.8137200
## queue queue 2286 0.7639276
## attract attract 2257 0.7542365
## small small 2236 0.7472188
## fun fun 2220 0.7418720
## firework firework 2215 0.7402011
## realli realli 2213 0.7395328
## wait wait 2178 0.7278366
## love love 2172 0.7258315
#hk_30%>%
# ggplot(aes(x=reorder(word,percentage), y=percentage, fill=percentage))+
# geom_col(position='dodge')+
# coord_flip()+
# scale_fill_gradient2(low ="#FFEDF0", high = "#C84747",space ="Lab", guide = FALSE) +
# theme_economist()+
# theme(plot.background = element_blank())
The above result shows the top 30 frequent words in Hong Kong branch. There are words that are very common in this scenario but do not convey useful information, like ‘disneyland’, ‘disney’, ‘one’, and verbs like ‘get’, ‘can’, ‘will’, etc. Therefore, by manually filtering words that might convey more meaningful information, we would be able to know what topics do the reviewer care about when they visit to each branch.
hk_topwords <- subset(hk_30, rownames(hk_30) %in% c("ride", "park", "kid", "show", 'hongkong',
"food", "parad", "queue", "firework", "wait")) # 10 word
hk_topwords%>%
ggplot(aes(x=reorder(word,percentage), y=percentage, fill=percentage))+
geom_col(position='dodge')+
coord_flip()+
scale_fill_gradient2(low ="#FFEDF0", high = "#C84747",space ="Lab", guide = FALSE) +
theme_economist()+
theme(plot.background = element_blank()) +
labs(x = "Words", y = "Frequency in %") +
ggtitle("Top 10 frequent topics in Hong Kong with % distribution")
Above are the selected top 10 topics with frequency percentages for
Hong Kong branch. We noticed that a considerable proportion of reviewers
care about ride, the park itself, care about kid, the shows, etc. Looks
like people tend to be excited about the Hong Kong branch itself.
Following by food, parade, the queue and wait, as well as the firework.
Looks like the top topics generally cover most of the facilities that
are included in a typical Disneyland park.
Next, we would use the same process to first generate top 30 frequent
words for California branch and Paris branch, and then manually select
meaningful topics that can provide insights.
# create corpus
corpus = Corpus(VectorSource(cali$Review_Text))
corpus[[1]][1]
# clean text
# convert to lower case
corpus = tm_map(corpus,FUN = content_transformer(tolower))
# remove urls
corpus = tm_map(corpus,
FUN = content_transformer(FUN = function(x)gsub(pattern = 'http[[:alnum:][:punct:]]*',
replacement = ' ',x = x)))
# remove punctuations
corpus = tm_map(corpus,FUN = removePunctuation)
corpus[[1]][1]
# remove stopwords
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
corpus[[1]][1]
# strip white spaces
corpus = tm_map(corpus,FUN = stripWhitespace)
corpus[[1]][1]
# create dictionary
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))
# stem document
corpus = tm_map(corpus,FUN = stemDocument)
# create document term matrix
dtm = DocumentTermMatrix(corpus)
# remove sparse terms
xdtm = removeSparseTerms(dtm,sparse = 0.97)
xdtm
dim(xdtm)
# make dataframe and complete stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
word_freq_cali <- data.frame(word = colnames(xdtm), freq = colSums(xdtm))
# word cloud
# set.seed(617)
# wordcloud(words = word_freq_cali$word,word_freq_cali$freq,scale=c(2,0.5),max.words = 100,colors=brewer.pal(9,"Spectral"))
word_total_ca = sum(word_freq_cali$freq)
ca_30 <- word_freq_cali %>%
mutate(percentage = freq/word_total_ca*100) %>%
arrange(desc(percentage)) %>%
head(30)
# top 30 frequent topics in California
ca_30
## word freq percentage
## park park 21691 3.1450362
## ride ride 20850 3.0230973
## time time 16646 2.4135481
## disneyland disneyland 15329 2.2225928
## day day 14834 2.1508214
## disney disney 12237 1.7742754
## get get 11930 1.7297626
## line line 9061 1.3137787
## pass pass 8027 1.1638562
## one one 7739 1.1220983
## wait wait 7580 1.0990445
## can can 7244 1.0503270
## visit visit 7217 1.0464122
## great great 7024 1.0184286
## place place 6596 0.9563717
## love love 6263 0.9080891
## just just 6233 0.9037394
## year year 5859 0.8495121
## kid kid 5837 0.8463223
## crowd crowd 5738 0.8319680
## like like 5303 0.7688962
## will will 5227 0.7578767
## fast fast 5201 0.7541069
## see see 5073 0.7355479
## peopl peopl 4628 0.6710261
## fun fun 4603 0.6674013
## even even 4481 0.6497122
## much much 4469 0.6479723
## enjoy enjoy 4399 0.6378228
## back back 4301 0.6236135
#ca_30%>%
# ggplot(aes(x=reorder(word, percentage), y=percentage, fill=percentage))+
# geom_col(position='dodge')+
# coord_flip()+
# scale_fill_gradient2(low ="#D6F1DF", high = "#00A632",space ="Lab", guide = FALSE) +
# theme_economist()+
# theme(plot.background = element_blank())
ca_topwords <- subset(ca_30, rownames(ca_30) %in% c("park", "ride", "line", "pass", "wait",
"kid", "crowd", "fast", "peopl", "back")) # 10 words
ca_topwords %>%
ggplot(aes(x=reorder(word, percentage), y=percentage, fill=percentage))+
geom_col(position='dodge')+
coord_flip()+
scale_fill_gradient2(low ="#D6F1DF", high = "#00A632",space ="Lab", guide = FALSE) +
theme_economist()+
theme(plot.background = element_blank()) +
labs(x = "Words", y = "Frequency in %") +
ggtitle("Top 10 frequent topics in California with % distribution")
Above is the top 10 most frequent topics in California branch. Comparing to the Hong Kong branch, we could easily see that words like ‘line’, ‘pass’, ‘crowd’, ‘fast’, and ‘people’ occured on the list. This might suggest that people have more to say about their queue experiences, waiting time, about the visiting crowd or visitor amount, and probably the speed and efficiency taking rides during their visits.
# create corpus
corpus = Corpus(VectorSource(paris$Review_Text))
corpus[[1]][1]
# clean text
# convert to lower case
corpus = tm_map(corpus,FUN = content_transformer(tolower))
# remove urls
corpus = tm_map(corpus,
FUN = content_transformer(FUN = function(x)gsub(pattern = 'http[[:alnum:][:punct:]]*',
replacement = ' ',x = x)))
# remove punctuations
corpus = tm_map(corpus,FUN = removePunctuation)
corpus[[1]][1]
# remove stopwords
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
corpus[[1]][1]
# strip white spaces
corpus = tm_map(corpus,FUN = stripWhitespace)
corpus[[1]][1]
# create dictionary
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(paris$Review_Text))),
lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))
# stem document
corpus = tm_map(corpus,FUN = stemDocument)
# create document term matrix
dtm = DocumentTermMatrix(corpus)
# remove sparse terms
xdtm = removeSparseTerms(dtm,sparse = 0.97)
xdtm
dim(xdtm)
# make dataframe and complete stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
word_freq_paris <- data.frame(word = colnames(xdtm), freq = colSums(xdtm))
# wordcloud
#set.seed(617)
#wordcloud(words = word_freq_paris$word,word_freq_paris$freq,scale=c(2,0.5),max.words = 100,colors=brewer.pal(9,"Spectral"))
word_total_pr = sum(word_freq_paris$freq)
pr_30 <- word_freq_paris %>%
mutate(percentage = freq/word_total_ca*100) %>%
arrange(desc(percentage)) %>%
head(30)
# top 30 frequent topics in Paris
pr_30
## word freq percentage
## 1 park 21527 3.1212574
## 2 ride 18419 2.6706201
## 3 disney 14422 2.0910844
## 4 time 13378 1.9397120
## 5 day 12148 1.7613711
## 6 get 9485 1.3752555
## 7 one 8172 1.1848802
## 8 queue 8167 1.1841552
## 9 disneyland 7417 1.0754107
## 10 visit 6822 0.9891400
## 11 just 6351 0.9208485
## 12 can 5997 0.8695211
## 13 food 5931 0.8599516
## 14 wait 5785 0.8387826
## 15 pariah. 5479 0.7944149
## 16 good 5455 0.7909351
## 17 great 5174 0.7501921
## 18 show 5060 0.7336630
## 19 see 4953 0.7181487
## 20 love 4861 0.7048094
## 21 kid 4818 0.6985747
## 22 reallity 4788 0.6942249
## 23 place 4766 0.6910351
## 24 year 4733 0.6862503
## 25 magic 4646 0.6736360
## 26 will 4532 0.6571068
## 27 parad 4524 0.6559469
## 28 characted 4491 0.6511621
## 29 hour 4471 0.6482623
## 30 like 4420 0.6408676
#pr_30%>%
# ggplot(aes(x=reorder(word, percentage), y=percentage, fill=percentage))+
# geom_col(position='dodge')+
# coord_flip()+
# scale_fill_gradient2(low ="#D4DFFF", high = "#4169E2",space ="Lab", guide = FALSE) +
# theme_economist()+
# theme(plot.background = element_blank())
pr_topwords <- subset(pr_30, rownames(pr_30) %in% c(1, 2, 8, 13, 14,
15, 18, 21, 27, 28)) # 10 words
pr_topwords %>%
ggplot(aes(x=reorder(word, percentage), y=percentage, fill=percentage))+
geom_col(position='dodge')+
coord_flip()+
scale_fill_gradient2(low ="#D4DFFF", high = "#4169E2",space ="Lab", guide = FALSE) +
theme_economist()+
theme(plot.background = element_blank()) +
labs(x = "Words", y = "Frequency in %") +
ggtitle("Top 10 frequent topics in Paris with % distribution")
Above is the top 10 most frequent topics in Paris branch. Comparing to the previous two branches, reviewers also mentioned a lot about ‘queue’, ‘food’, ‘wait’, ‘show’, ‘parade’, and ‘kid’. However, what’s worth noticing is that they also mentioned a lot about ‘pariah’ and ‘character’, which are new to the other two branches.
head(paris[grep("pariah", paris$Review_Text, ignore.case = TRUE), "Review_Text"], 2)
## [1] "Currently on holiday at Eurodisney. Have been to Disney in Florida numerous times, although extremely busy in high season the USA staff are courteous, friendly and efficient. Eurodisney is pleasant and spotless. But Oh! The service! There are practically no guides or attendants to assist you. And don't even think of trying to eat between noon and 3pm. We stood in line for fast food fare for one hour, we then stood at the counter for another fifteen minutes, we then fought for a table for a further fifteen minutes! I know that a Disney visit entails a lot of waiting in line but having to spend 90 minutes to get a fast food burger and fries is unacceptable. On informing the staff of the problem they shrugged their shoulders and basically did not care. Take a tip from me: Pay the little extra and go to the real Disney where the staff will treat you like a guest, not a pariah!"
## [2] "It was a living hell.I took 2 trained carers and a severely mentally handicapped adult. It was a special once in a lifetime treat so stayed in the very expensive Disneyland Hotel. Took in doctors letter detailing ALL his medical problems. The staff in City Hall gave us a pass and explained how to use it., they were very pleasant.Sadly Disney have not told any of their ride operators about it and we were treated like S*$T.The 2 carers stated they would NEVER return and will advise all other parents NOT to consider it as they had never been anywhere so UNFRIENDLY to the disabled.Most of the disabled exits marked for your use are UNMANNED which ties in with the Disney approach of Who cares, we already have yer cash One of my carers assisted a family group who were in the wrong queue and would have stood there forever ignored by park staff and took them to where they could actually access a ride.Complete SO WHAT attitude by all ride staff. Told to go to the back of the queue , what was my problem ??? Must have inhaled around 100 cigs by passive smoking as its EVERYWHERE. Food is rubbish and overpriced.We hated it, we were made to feel like pariahs, my son was signing HOME by Sunday afternoon and was desparate to leave..........and this is someone who has been obsessed by Disney videos etc all his life. AWFUL."
head(paris[grep("character", paris$Review_Text, ignore.case = TRUE), "Review_Text"], 2)
## [1] "3rd time hereIt gets betterPlenty to do for allShopping Meet the Character'sRoller coasters and finish off with the fireworks display at 10 beside the castleSpace Mountain and Rockin roller coaster highly recomended or if you have the nerve tower of terror"
## [2] "Its a shame this park carry the name Disney... They build the place years back and they are now just collecting cash without any effort, it does not compare in anything with the American parks which are truly amazing. The whole park now is designed to get money from their guests on every step without offering any services. The staff are truly miserable and seems like they hate their job. In the whole park there were no characters walking around at all so a kid can take a picture. There were 3 characters who a kid could meet in the whole park but you had to wait forever just to take a picture. Many rides where closed and the wait time for anything was 60 minutes or more, who in the world can make a a kid stand still in a waiting line for 60 Minutes or more? Avoid at all costs especially if you have young children. PS. Parking is 30 to park 30 minutes walk from the entrance after spending 75 person for your ticket...just ridiculous "
Taking closer look at the reviews, we noticed that the reviewer who mentioned ‘pariah’ mostly talked about how they were treated like ‘pariah’ in some cases, some suggested otherwise. This might suggest that the customer services might be considered as polarized. For the ‘character’ word, this might suggest that visitors who went to Paris branch pay considerable attention to meeting with the characters in the park.
hk_sub <- filter(hk_topwords,
word %in% c('park', 'ride', 'wait', 'kid'))%>% mutate(branch = 'hk')
ca_sub <- filter(ca_topwords,
word %in% c('park', 'ride', 'wait', 'kid'))%>% mutate(branch = 'ca')
pr_sub <- filter(pr_topwords,
word %in% c('park', 'ride', 'wait', 'kid'))%>% mutate(branch = 'pr')
same_topwords = rbind(hk_sub, ca_sub, pr_sub)
same_topwords <- same_topwords %>%
select(word, percentage, branch)
ggplot(same_topwords, aes(x = word, y = percentage, fill = branch)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ branch, ncol = 3) +
labs(x = "Words", y = "Percentages in %", fill = "Branch") +
theme(plot.background = element_blank(),
#text=element_text(size=16),
strip.text=element_text(size=14),
axis.text.x=element_text(size=14, angle=45, hjust=1),
axis.text.y=element_text(size=14),
legend.position = "none") +
scale_fill_manual(values = c("hk" = "#EE4242", "ca" = "#28E92C", "pr" = "#4169E2", guide = FALSE)) +
ggtitle("Same frequent words/topics in all three branches")
These words, ‘kid’, ‘park’, ‘ride’, and ‘wait’ are topics that are mentioned for all three branches. These might suggest that reviewers care about these topics regardless of the branch locations.
Ho: Hong Kong Disneyland receives reviews related to shopping
experiences in similar proportion as other branches.
Ha: Hong Kong Disneyland receives reviews related to shopping
experiences in different proportion as other branches.
# examine branch names
unique(disneyland$Branch)
## [1] "Disneyland_HongKong" "Disneyland_California" "Disneyland_Paris"
Split dataset based on branch
hongkong <- disneyland %>%
filter(Branch == 'Disneyland_HongKong')
cali <- disneyland %>%
filter(Branch == 'Disneyland_California')
paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris')
Check document integrity
nrow(disneyland) == nrow(hongkong) + nrow(cali) + nrow(paris)
## [1] TRUE
Create corpus based on branches:
corpus_hk = Corpus(VectorSource(hongkong$Review_Text))
corpus_cali = Corpus(VectorSource(cali$Review_Text))
corpus_paris = Corpus(VectorSource(paris$Review_Text))
Define shopping-related words:
shopping_words <- c(
'shop',
'souvenir',
'merchandise',
'collectible',
'gift',
'shirt',
'toy',
'book',
'hat',
'band',
'art', 'mall', 'cashier', 'cloth', 'checkout', 'cart', 'outlet', 'bag', 'quality', 'boutique'
)
Cleaning texts:
corpus <- corpus_hk
# convert to lower cases
corpus = tm_map(corpus,FUN = content_transformer(tolower))
# remove punctuations
corpus = tm_map(corpus,FUN = removePunctuation)
# remove stopwords
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
# stripwhitespaces
corpus = tm_map(corpus,FUN = stripWhitespace)
# stem the corpurs
corpus = tm_map(corpus,FUN = stemDocument)
# convert corpus to document term matrix to get a list of unique words contained in all reviews
dtm_hk <- DocumentTermMatrix(corpus)
# select words unrelated to shopping to be removed
words_remove <- setdiff(colnames(dtm_hk), shopping_words)
length(words_remove)
## [1] 17979
# remove words unrelated to shopping
corpus = tm_map(corpus, FUN = removeWords, words_remove[1:4000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[4001:7000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[7001:10000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[10001:13000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[13001:16000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[16001:17987])
corpus = tm_map(corpus, FUN = removeNumbers)
Transform cleaned corpus to document term matrix
dtm_hk_then <- DocumentTermMatrix(corpus)
colnames(dtm_hk_then)
## [1] "gift" "shop" "souvenir" "book" "toy" "cloth"
## [7] "hat" "outlet" "bag" "mall" "cart" "cashier"
## [13] "shirt" "art" "band" "checkout"
Make data frame
dtm_hk_then = as.data.frame(as.matrix(dtm_hk_then))
sort(colSums(dtm_hk_then),decreasing = T)
## shop toy book souvenir bag gift outlet cart
## 860 738 416 400 194 133 107 78
## hat cloth shirt art band mall cashier checkout
## 76 68 48 24 19 18 16 1
Add information summarizing columns:
# add a column counting the number of shopping related words in a review
dtm_hk_then['shopping_word_count'] = rowSums(dtm_hk_then)
# add a column specifying if a review contains shopping related words
dtm_hk_then['if_shopping_word'] = ifelse(dtm_hk_then$shopping_word_count >= 1, 1, 0)
Attach word frequency to original data frame and select columns
hongkong <- hongkong %>%
cbind(dtm_hk_then) %>%
select(-Review_ID, -Reviewer_Location,
-Review_Text, -Year_Month,
-Year, -Month, -continent)
hongkong['no_shopping_word'] = ifelse(rowSums(hongkong[4:19]) == 0, 1, 0)
hongkong <- hongkong %>%
select_if(function(x) !all(x == 0))
Calculate average ratings for each shopping words and add counts
shopping_hk = data.frame(rd=names(colSums(hongkong[c(4:22)])))
df <- data.frame()
for (i in 1:nrow(shopping_hk)) {
for (k in shopping_hk[i,]) {
avg_rating <- hongkong %>%
filter(hongkong[,k] != 0) %>%
summarise(avg_rating = mean(Rating))
df <- rbind(df, avg_rating)
}
}
shopping_mention_count <- data.frame(rides = names(colSums(hongkong[c(4:22)])),
counts = as.numeric(colSums(hongkong[c(4:22)])),
avg_raing = df)
shopping_mention_count
## rides counts avg_rating
## 1 gift 133 4.237705
## 2 shop 860 4.196906
## 3 souvenir 400 4.185792
## 4 book 416 4.385445
## 5 toy 738 4.297638
## 6 cloth 68 4.348485
## 7 hat 76 4.027397
## 8 outlet 107 3.970588
## 9 bag 194 4.075949
## 10 mall 18 4.000000
## 11 cart 78 4.144928
## 12 cashier 16 3.545455
## 13 shirt 48 4.066667
## 14 art 24 4.523810
## 15 band 19 4.277778
## 16 checkout 1 4.000000
## 17 shopping_word_count 3196 4.241515
## 18 if_shopping_word 2033 4.241515
## 19 no_shopping_word 7114 4.208040
Get frequency of each shopping word in total branch reviews & in reviews that mentioned shopping in the branch
freq_all <- shopping_mention_count %>%
mutate(freq_branch_pct = counts/nrow(hongkong)*100,
freq_Mention_pct = counts/counts[17]*100) %>%
mutate_at(c(3:5), round, digits = 2)%>%
arrange(desc(freq_Mention_pct))
freq_all
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 no_shopping_word 7114 4.21 77.77 222.59
## 2 shopping_word_count 3196 4.24 34.94 100.00
## 3 if_shopping_word 2033 4.24 22.23 63.61
## 4 shop 860 4.20 9.40 26.91
## 5 toy 738 4.30 8.07 23.09
## 6 book 416 4.39 4.55 13.02
## 7 souvenir 400 4.19 4.37 12.52
## 8 bag 194 4.08 2.12 6.07
## 9 gift 133 4.24 1.45 4.16
## 10 outlet 107 3.97 1.17 3.35
## 11 cart 78 4.14 0.85 2.44
## 12 hat 76 4.03 0.83 2.38
## 13 cloth 68 4.35 0.74 2.13
## 14 shirt 48 4.07 0.52 1.50
## 15 art 24 4.52 0.26 0.75
## 16 band 19 4.28 0.21 0.59
## 17 mall 18 4.00 0.20 0.56
## 18 cashier 16 3.55 0.17 0.50
## 19 checkout 1 4.00 0.01 0.03
Proportions of reviews that mentioned shopping words & no mention rides in the branch
p_mention_orno <- freq_all[c(1,3), 1:4]
p_mention_orno%>%
ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
labs(fill = "Shopping Words Mentioned") +
ggtitle("Reviews that mentioned & did not mention shopping words in Hong Kong Branch") +
theme_void()+
geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
scale_fill_manual(values=c("#F1786C", "#E8CCC7"),
labels=c("Mentioned", "Did not mention"))
Proportions of shopping words mentioned in the branch over total number of reviews in the branch & over reviews that mentioned rides in branch
p_shops <- freq_all[4:19, 1:5]
# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_shops <- p_shops %>%
filter(freq_Mention_pct >= 2)
shopping_rank_plot <- hot_shops%>%
ggplot(aes(x=reorder(rides,freq_Mention_pct),
y=freq_Mention_pct, fill=freq_Mention_pct))+
geom_col(position='dodge')+
coord_flip()+
labs(x = "", y = "Frequency in %") +
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(0, 30)) +
ggtitle("All shopping words mentioned in Hong Kong with % distribution")+
scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = FALSE) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15))
rating_rank_plot <- hot_shops %>%
ggplot() +
geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct),
width=0.7, position='dodge')+
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(-3, 30)) +
geom_text(aes(x=rides, y=-1.5, label=avg_rating), size = 5) +
labs(x = "Mostly mentioned shopping words", y = "Frequency in %") +
ggtitle("Average rating ranked with most popular shopping words & no shopping words") +
scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = FALSE)+
labs(tag ="Ratings:")+
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.tag.position = c(0.01,0.12),
plot.tag = element_text(hjust = 0, size=13),
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 13))
#grid.arrange(shopping_rank_plot, rating_rank_plot, ncol = 2)
shopping_rank_plot
rating_rank_plot
From the two graphs above, we can see that words including “shop”, “toy”, “book” and “souvenir” are mentioned most frequently in reviews of the Hong Kong brancn, indicating the most cared aspects of shopping. Products like book, cloth and toy received higher ratings while cart, bag hat received lower ratings also implies customer satisfaction.
Cleaning texts:
corpus <- corpus_cali
corpus = tm_map(corpus,FUN = content_transformer(tolower))
corpus = tm_map(corpus,FUN = removePunctuation)
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
corpus = tm_map(corpus,FUN = stripWhitespace)
corpus = tm_map(corpus,FUN = stemDocument)
dtm_cali <- DocumentTermMatrix(corpus)
words_remove <- setdiff(colnames(dtm_cali), shopping_words)
length(words_remove)
## [1] 25632
corpus = tm_map(corpus, FUN = removeWords, words_remove[1:4000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[4001:7000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[7001:10000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[10001:13000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[13001:16000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[16001:19000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[19001:21000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[21001:24000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[24001:25632])
corpus = tm_map(corpus, FUN = removeNumbers)
Transform cleaned corpus to document term matrix
dtm_cali_then <- DocumentTermMatrix(corpus)
colnames(dtm_cali_then)
## [1] "book" "bag" "shop" "gift" "cloth" "band"
## [7] "cart" "shirt" "souvenir" "art" "toy" "hat"
## [13] "outlet" "cashier" "mall" "checkout"
Make data frame
dtm_cali_then = as.data.frame(as.matrix(dtm_cali_then))
sort(colSums(dtm_cali_then),decreasing = T)
## shop book bag souvenir toy gift hat band
## 864 777 392 308 189 173 168 139
## cloth cart shirt art outlet cashier mall checkout
## 123 123 98 87 53 21 16 4
Add information summarizing columns:
# add a column counting the number of shopping related words in a review
dtm_cali_then['shopping_word_count'] = rowSums(dtm_cali_then)
# add a column specifying if a review contains shopping related words
dtm_cali_then['if_shopping_word'] = ifelse(dtm_cali_then$shopping_word_count >= 1, 1, 0)
Attach word frequency to original data frame and select columns
cali <- cali %>%
cbind(dtm_cali_then) %>%
select(-Review_ID, -Reviewer_Location,
-Review_Text, -Year_Month,
-Year, -Month, -continent)
cali['no_shopping_word'] = ifelse(rowSums(cali[4:19]) == 0, 1, 0)
cali <- cali %>%
select_if(function(x) !all(x == 0))
Calculate average ratings for each shopping words and add counts
shopping_ca = data.frame(rd=names(colSums(cali[c(4:22)])))
# average rating for each shopping word
df <- data.frame()
for (i in 1:nrow(shopping_ca)) {
for (k in shopping_ca[i,]) {
avg_rating <- cali %>%
filter(cali[,k] != 0) %>%
summarise(avg_rating = mean(Rating))
df <- rbind(df, avg_rating)
}
}
# shopping words, counts, and average ratings
shopping_mention_count <- data.frame(rides = names(colSums(cali[c(4:22)])),
counts = as.numeric(colSums(cali[c(4:22)])),
avg_raing = df)
shopping_mention_count
## rides counts avg_rating
## 1 book 777 4.286441
## 2 bag 392 4.003247
## 3 shop 864 4.410397
## 4 gift 173 4.416667
## 5 cloth 123 4.385965
## 6 band 139 4.413223
## 7 cart 123 4.037383
## 8 shirt 98 4.116279
## 9 souvenir 308 4.117438
## 10 art 87 4.535211
## 11 toy 189 4.260606
## 12 hat 168 4.493151
## 13 outlet 53 4.060000
## 14 cashier 21 3.684211
## 15 mall 16 3.937500
## 16 checkout 4 4.500000
## 17 shopping_word_count 3535 4.267581
## 18 if_shopping_word 2332 4.267581
## 19 no_shopping_word 15870 4.435161
Get frequency of each shopping word in total branch reviews & in reviews that mentioned shopping in the branch
freq_all <- shopping_mention_count %>%
mutate(freq_branch_pct = counts/nrow(cali)*100,
freq_Mention_pct = counts/counts[17]*100) %>%
mutate_at(c(3:5), round, digits = 2)%>%
arrange(desc(freq_Mention_pct))
freq_all
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 no_shopping_word 15870 4.44 87.19 448.94
## 2 shopping_word_count 3535 4.27 19.42 100.00
## 3 if_shopping_word 2332 4.27 12.81 65.97
## 4 shop 864 4.41 4.75 24.44
## 5 book 777 4.29 4.27 21.98
## 6 bag 392 4.00 2.15 11.09
## 7 souvenir 308 4.12 1.69 8.71
## 8 toy 189 4.26 1.04 5.35
## 9 gift 173 4.42 0.95 4.89
## 10 hat 168 4.49 0.92 4.75
## 11 band 139 4.41 0.76 3.93
## 12 cloth 123 4.39 0.68 3.48
## 13 cart 123 4.04 0.68 3.48
## 14 shirt 98 4.12 0.54 2.77
## 15 art 87 4.54 0.48 2.46
## 16 outlet 53 4.06 0.29 1.50
## 17 cashier 21 3.68 0.12 0.59
## 18 mall 16 3.94 0.09 0.45
## 19 checkout 4 4.50 0.02 0.11
p_mention_orno <- freq_all[c(1,3), 1:4]
p_mention_orno%>%
ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
labs(fill = "Shopping Words Mentioned") +
ggtitle("Reviews that mentioned & did not mention shopping words in California Branch") +
theme_void()+
geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
scale_fill_manual(values=c("#00A632", "#D6F1DF"),
labels=c("Mentioned", "Did not mention"))
Proportions of shopping words mentioned in the branch over total number of reviews in the branch & over reviews that mentioned rides in branch
p_shops <- freq_all[4:19, 1:5]
# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_shops <- p_shops %>%
filter(freq_Mention_pct >= 2)
shopping_rank_plot <- hot_shops%>%
ggplot(aes(x=reorder(rides,freq_Mention_pct),
y=freq_Mention_pct, fill=freq_Mention_pct))+
geom_col(position='dodge')+
coord_flip()+
labs(x = "", y = "Frequency in %") +
ggtitle("All shopping words mentioned in California with % distribution")+
scale_fill_gradient(low ="#D6F1DF", high = "#00A632", guide = FALSE) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 13))
rating_rank_plot <- hot_shops %>%
ggplot() +
geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct),
width=0.7, position='dodge')+
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(-3, 25)) +
geom_text(aes(x=rides, y=-1.2, label=avg_rating), size = 5) +
labs(x = "Mostly mentioned shopping words", y = "Frequency in %") +
ggtitle("Average rating ranked with most popular shopping words & no shopping words") +
scale_fill_gradient(low ="#D6F1DF", high = "#00A632", guide = FALSE)+
labs(tag ="Ratings:")+
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.tag.position = c(0.005,0.135),
plot.tag = element_text(hjust = 0, size=14),
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 13))
#grid.arrange(shopping_rank_plot, rating_rank_plot, ncol = 2)
shopping_rank_plot
rating_rank_plot
From the two graph above, we can observe that “shop”, “book”, “bag” and “souvenir” are mentioned most frequently. Art, hat, gift and band receive higher ratings on average, but more frequently mentioned products and words received relatively lower rating.
Cleaning texts:
corpus <- corpus_paris
corpus = tm_map(corpus,FUN = content_transformer(tolower))
corpus = tm_map(corpus,FUN = removePunctuation)
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
corpus = tm_map(corpus,FUN = stripWhitespace)
#dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
# lowfreq = 0)
#dict_corpus = Corpus(VectorSource(dict))
corpus = tm_map(corpus,FUN = stemDocument)
dtm_hk <- DocumentTermMatrix(corpus)
words_remove <- setdiff(colnames(dtm_hk), shopping_words)
length(words_remove)
## [1] 32589
corpus = tm_map(corpus, FUN = removeWords, words_remove[1:4000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[4001:7000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[7001:10000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[10001:13000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[13001:16000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[16001:19000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[19001:21000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[21001:24000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[24001:27000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[27001:30000])
corpus = tm_map(corpus, FUN = removeWords, words_remove[30001:32589])
corpus = tm_map(corpus, FUN = removeNumbers)
Transform cleaned corpus to document term matrix
dtm_paris_then <- DocumentTermMatrix(corpus)
colnames(dtm_paris_then)
## [1] "outlet" "book" "shop" "hat" "gift" "cart"
## [7] "souvenir" "shirt" "toy" "mall" "bag" "band"
## [13] "cloth" "cashier" "art" "checkout"
Make data frame
dtm_paris_then = as.data.frame(as.matrix(dtm_paris_then))
sort(colSums(dtm_paris_then),decreasing = T)
## book shop bag toy souvenir outlet gift hat
## 2299 2042 579 340 298 259 243 128
## cloth cart band shirt mall cashier art checkout
## 124 103 67 53 49 44 41 8
Add information summarizing columns:
# add a column counting the number of shopping related words in a review
dtm_paris_then['shopping_word_count'] = rowSums(dtm_paris_then)
# add a column specifying if a review contains shopping related words
dtm_paris_then['if_shopping_word'] = ifelse(dtm_paris_then$shopping_word_count >= 1, 1, 0)
Attach word frequency to original data frame and select columns
paris <- paris %>%
cbind(dtm_paris_then) %>%
select(-Review_ID, -Reviewer_Location,
-Review_Text, -Year_Month,
-Year, -Month, -continent)
paris['no_shopping_word'] = ifelse(rowSums(paris[4:19]) == 0, 1, 0)
paris <- paris %>%
select_if(function(x) !all(x == 0))
Calculate average ratings for each shopping words and add counts
shopping_paris = data.frame(rd=names(colSums(paris[c(4:22)])))
# average rating for each shopping word
df <- data.frame()
for (i in 1:nrow(shopping_paris)) {
for (k in shopping_paris[i,]) {
avg_rating <- paris %>%
filter(paris[,k] != 0) %>%
summarise(avg_rating = mean(Rating))
df <- rbind(df, avg_rating)
}
}
# shopping words, counts, and average ratings
shopping_mention_count <- data.frame(rides = names(colSums(paris[c(4:22)])),
counts = as.numeric(colSums(paris[c(4:22)])),
avg_raing = df)
shopping_mention_count
## rides counts avg_rating
## 1 outlet 259 3.530172
## 2 book 2299 3.844173
## 3 shop 2042 3.903348
## 4 hat 128 4.061947
## 5 gift 243 3.905830
## 6 cart 103 3.608696
## 7 souvenir 298 4.059041
## 8 shirt 53 3.653061
## 9 toy 340 3.858108
## 10 mall 49 3.195652
## 11 bag 579 3.745455
## 12 band 67 4.134615
## 13 cloth 124 3.965812
## 14 cashier 44 3.050000
## 15 art 41 3.975000
## 16 checkout 8 3.500000
## 17 shopping_word_count 6677 3.840624
## 18 if_shopping_word 3589 3.840624
## 19 no_shopping_word 9105 4.035695
Get frequency of each shopping word in total branch reviews & in reviews that mentioned shopping in the branch
freq_all <- shopping_mention_count %>%
mutate(freq_branch_pct = counts/nrow(paris)*100,
freq_Mention_pct = counts/counts[17]*100) %>%
mutate_at(c(3:5), round, digits = 2)%>%
arrange(desc(freq_Mention_pct))
freq_all
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 no_shopping_word 9105 4.04 71.73 136.36
## 2 shopping_word_count 6677 3.84 52.60 100.00
## 3 if_shopping_word 3589 3.84 28.27 53.75
## 4 book 2299 3.84 18.11 34.43
## 5 shop 2042 3.90 16.09 30.58
## 6 bag 579 3.75 4.56 8.67
## 7 toy 340 3.86 2.68 5.09
## 8 souvenir 298 4.06 2.35 4.46
## 9 outlet 259 3.53 2.04 3.88
## 10 gift 243 3.91 1.91 3.64
## 11 hat 128 4.06 1.01 1.92
## 12 cloth 124 3.97 0.98 1.86
## 13 cart 103 3.61 0.81 1.54
## 14 band 67 4.13 0.53 1.00
## 15 shirt 53 3.65 0.42 0.79
## 16 mall 49 3.20 0.39 0.73
## 17 cashier 44 3.05 0.35 0.66
## 18 art 41 3.98 0.32 0.61
## 19 checkout 8 3.50 0.06 0.12
p_mention_orno <- freq_all[c(1,3), 1:4]
p_mention_orno%>%
ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
labs(fill = "Shopping Words Mentioned") +
ggtitle("Reviews that mentioned & did not mention shopping words in Paris Branch") +
theme_void()+
geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
scale_fill_manual(values=c("#4169E2", "#D4DFFF"),
labels=c("Mentioned", "Did not mention"))
Proportions of shopping words mentioned in the branch over total number of reviews in the branch & over reviews that mentioned rides in branch
p_shops <- freq_all[4:19, 1:5]
# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_shops <- p_shops %>%
filter(freq_Mention_pct >= 2)
shopping_rank_plot <- hot_shops%>%
ggplot(aes(x=reorder(rides,freq_Mention_pct),
y=freq_Mention_pct, fill=freq_Mention_pct))+
geom_col(position='dodge')+
coord_flip()+
labs(x = "", y = "Frequency in %") +
ggtitle("All shopping words mentioned in Paris with % distribution")+
scale_fill_gradient(low ="#D4DFFF", high = "#4169E2", guide = FALSE) +
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(0, 40)) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15))
rating_rank_plot <- hot_shops %>%
ggplot() +
geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct),
width=0.7, position='dodge')+
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(-4.5, 40)) +
geom_text(aes(x=rides, y=-2.5, label=avg_rating), size = 5) +
labs(x = "Mostly mentioned shopping words", y = "Frequency in %") +
ggtitle("Average rating ranked with most popular shopping words & no shopping words") +
scale_fill_gradient(low ="#D4DFFF", high = "#4169E2", guide = FALSE)+
labs(tag ="Ratings:")+
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.tag.position = c(0.01,0.125),
plot.tag = element_text(hjust = 0, size=15),
plot.title = element_text(size = 20),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 13))
# grid.arrange(shopping_rank_plot, rating_rank_plot, ncol = 2)
shopping_rank_plot
rating_rank_plot
From the visualizations, “book” and “shop” are the leading words mentioned in the reviews, with “bag” and “toy” followed. Generally these shopping-related words received lower rating in Paris branch compared to the other two with souvenir and gift most favored by reviewers.
Integrate the three branches and attach to original dataset
# rbind all 3 branches
tri_branch <- rbind(hongkong, cali, paris)
# cbind indicators to original dataset
disneyland <- cbind(disneyland, tri_branch[, 21])
colnames(disneyland)[11] <- 'if_shopping_words'
Distribution of sentiment scores with shopping words
afinn<-read.csv("afinn.csv")
dis_w_shop <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)%>%
summarize(reviewSentiment = mean(value))%>%
ungroup()%>%
ggplot(aes(x=reviewSentiment,fill=reviewSentiment>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Hongkong Branch (with shopping words)")
dis_w_shop
Distribution of sentiment scores without shopping words
dis_wt_shop <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)%>%
summarize(reviewSentiment = mean(value))%>%
ungroup()%>%
ggplot(aes(x=reviewSentiment,fill=reviewSentiment>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Hongkong Branch (without shopping words)")
dis_wt_shop
Distribution of sentiment scores with shopping words
dis_w_shop <- disneyland %>%
filter(Branch == 'Disneyland_California', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)%>%
summarize(reviewSentiment = mean(value))%>%
ungroup()%>%
ggplot(aes(x=reviewSentiment,fill=reviewSentiment>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in California Branch (with shopping words)")
dis_w_shop
Distribution of sentiment scores without shopping words
dis_wt_shop <- disneyland %>%
filter(Branch == 'Disneyland_California', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)%>%
summarize(reviewSentiment = mean(value))%>%
ungroup()%>%
ggplot(aes(x=reviewSentiment,fill=reviewSentiment>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in California Branch (without shopping words)")
dis_wt_shop
Distribution of sentiment scores with shopping words
dis_w_shop <- disneyland %>%
filter(Branch == 'Disneyland_Paris', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)%>%
summarize(reviewSentiment = mean(value))%>%
ungroup()%>%
ggplot(aes(x=reviewSentiment,fill=reviewSentiment>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Paris Branch (with shopping words)")
dis_w_shop
Distribution of sentiment scores without shopping words
dis_wt_shop <- disneyland %>%
filter(Branch == 'Disneyland_Paris', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)%>%
summarize(reviewSentiment = mean(value))%>%
ungroup()%>%
ggplot(aes(x=reviewSentiment,fill=reviewSentiment>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Paris Branch (without shopping words)")
dis_wt_shop
Calculate proportions and average scores required
##### hong kong#####
# proportion of positive sentiment with shopping words
pos_prop_w_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / n())
# proportion of positive sentiment without shopping words
pos_prop_wt_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / n())
# average sentiment score with shopping words
mean_w_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
# average sentiment score without shopping words
mean_wt_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
##### california #####
# proportion of positive sentiment with shopping words
pos_prop_w_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / n())
# proportion of positive sentiment without shopping words
pos_prop_wt_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / n())
# average sentiment score with shopping words
mean_w_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
# average sentiment score without shopping words
mean_wt_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
##### paris #####
# proportion of positive sentiment with shopping words
pos_prop_w_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / n())
# proportion of positive sentiment without shopping words
pos_prop_wt_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / n())
# average sentiment score with shopping words
mean_w_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', if_shopping_words == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
# average sentiment score without shopping words
mean_wt_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', if_shopping_words == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
Proportion of positive words in reviews w/wt shopping words
# make proportion plot
df_prop = data.frame(branch = c('Hong Kong', 'California', 'Paris'),
prop_pos_words_w_shop = c(as.numeric(pos_prop_w_shop_hk), as.numeric(pos_prop_w_shop_ca), as.numeric(pos_prop_w_shop_paris)),
prop_pos_words_wt_shop = c(as.numeric(pos_prop_wt_shop_hk), as.numeric(pos_prop_wt_shop_ca), as.numeric(pos_prop_wt_shop_paris)))
df_prop_long <- tidyr::pivot_longer(df_prop, cols = c("prop_pos_words_w_shop", "prop_pos_words_wt_shop"), names_to = "category", values_to = "proportion")
ggplot(df_prop_long, aes(x = branch, y = proportion, fill = category)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7, color = "black", size = 0.5) +
scale_fill_manual(values = c("#0072B2", "#F0E442"),
labels=c("Mentioned", "Did not mention")) +
labs(title = "Proportion of Positive Words with and without Shopping words", x = "Branch", y = "Proportion", fill = 'Shopping words mentioned') +
geom_text(aes(label=round(proportion, 2)), position=position_dodge(width=0.9), vjust=-0.25) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(size = 14, face = "bold"),
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12))
Average sentiment scores of reviews w/wt shopping words
# make average score plot
df_mean = data.frame(branch = c('Hong Kong', 'California', 'Paris'),
mean_w_shop = c(as.numeric(mean_w_shop_hk), as.numeric(mean_w_shop_ca), as.numeric(mean_w_shop_paris)),
mean_wt_shop = c(as.numeric(mean_wt_shop_hk), as.numeric(mean_wt_shop_ca), as.numeric(mean_wt_shop_paris)))
df_mean_long <- tidyr::pivot_longer(df_mean, cols = c("mean_w_shop", "mean_wt_shop"), names_to = "category", values_to = "proportion")
ggplot(df_mean_long, aes(x = branch, y = proportion, fill = category)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7, color = "black", size = 0.5) +
scale_fill_manual(values = c("#0072B2", "#F0E442"),
labels=c("Mentioned", "Did not mention")) +
labs(title = "Average sentiment scores with and without Shopping words", x = "Branch", y = "Average sentiment score", fill = 'Shopping words mentioned') +
geom_text(aes(label=round(proportion, 2)), position=position_dodge(width=0.9), vjust=-0.25) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(size = 14, face = "bold"),
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12))
From the proportion and average sentiment score graphs presented above, we observe that among all 3 branches, more positive words are in reviews unrelated to shopping experience, and reviews unrelated to shopping experience tend to have more positive tone than reviews that talked about shopping. We can thus conclude that it might be a common problem that Disneyland as a whole has to do better in improving travelers’ shopping experience.
Ho: California Disneyland receives reviews related to rides
experiences in similar proportion as other branches.
Ha: California Disneyland receives reviews related to rides experiences
in different proportion as other branches.
rides_name <- rides$Ride_name
#head(rides_name)
We matched ride features with reviews mentioned the ride, which is explained in detail in research question no.3.
#rides_name[1:14]
disneyland_ride <- disneyland %>%
mutate(Astro_Orbiter = case_when(grepl(c("astro orbiter", "Astro", "Orbiter"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Avatar_Flight = case_when(grepl(c("Avatar Flight of Passage", "Avatar Flight", "Avatar Ride"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Big_Thunder = case_when(grepl(c("Big Thunder Mountain Railroad", "Big Thunder"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Buzz_Lightyear = case_when(grepl(c("Buzz Lightyear's Space Ranger Spin", "Buzz Lightyear's", "Space Ranger Spin", "Space Ranger"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Dinosaur = case_when(grepl(c("Dinosaur"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Dumbo = case_when(grepl(c("Dumbo the Flying Elephant", "Flying Elephant"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Expedition_Everest = case_when(grepl(c("Expedition Everest"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Frozen_Ever = case_when(grepl(c("Frozen Ever After", "Ever After"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Gran_Fiesta = case_when(grepl(c("Gran Fiesta Tour Starring The Three Caballeros", "Gran Fiesta", "Starring The Three Caballeros", "Three Caballeros"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Haunted_Mansion = case_when(grepl(c("Haunted Mansion"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Small_World= case_when(grepl(c("It's a Small World", "Small World"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Journey_Into= case_when(grepl(c("Journey Into Imagination with Figment", "Imagination with Figment"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Jungle_Cruise= case_when(grepl(c("Jungle Cruise"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0)) %>%
mutate(Kali_River = case_when(grepl(c("Kali River Rapids", "Kali River"), Review_Text, ignore.case = TRUE) ~ 1,
TRUE ~ 0))
Because some of the rides only exists in the Orlando Disney, so we
want to exclude the column associated with rides not being mentioned in
any of the Review_Text.
# rides mentioned in the Disneyland dataset: 27-3 = 24 rides. There are 40043 reviews. dropped other columns
disneyland_ride2 <- disneyland_ride %>%
select_if(function(x) !all(x == 0)) %>%
select(c(2,6,9,11:34))
# disneyland_ride2: 3 basic info + 24 rides + 2 mention columns. 29 cols, 40043 reviews
disneyland_ride2$mention_ride <- ifelse(rowSums(disneyland_ride2[c(4:27)]) == 0, 0, 1)
disneyland_ride2$no_mention_ride <- ifelse(rowSums(disneyland_ride2[c(4:27)]) == 0, 1, 0)
#head(disneyland_ride2)
Split dataset based on branch for those mentioned at least one ride
# also remove ride columns that are not in branch
hongkong <- disneyland_ride2 %>%
filter(Branch == 'Disneyland_HongKong') %>%
select_if(function(x) !all(x == 0)) # HK: 1:3(info) + 4:19(16 rides) + 20:21(mention cols)
cali <- disneyland_ride2 %>%
filter(Branch == 'Disneyland_California') %>%
select_if(function(x) !all(x == 0)) # CA: 1:3(info) + 4:23(20 rides) + 24:25(mention cols)
paris <- disneyland_ride2 %>%
filter(Branch == 'Disneyland_Paris') %>%
select_if(function(x) !all(x == 0)) # PR: 1:3(info) + 4:20(17 rides) + 21:22(mention cols)
We structured the columns to be: 3 columns of basic information, ride
dummy variables, and 2 columns indicating whether the line of the review
mentioned rides or not.
hongkong: 3 info + 16 rides + 2 mention columns for Hong
Kong
cali: 3 info + 20 rides + 2 mention columns for
California
paris: 3 info + 17 rides + 2 mention columns for Paris
#head(hongkong)
rides_hk = data.frame(rd=names(colSums(hongkong[c(4:21)])))
rides_hk # HK: 4:19(16 rides) + 20:21(2 mention cols)
## rd
## 1 Big_Thunder
## 2 Dinosaur
## 3 Dumbo
## 4 Expedition_Everest
## 5 Haunted_Mansion
## 6 Small_World
## 7 Jungle_Cruise
## 8 Pirates
## 9 Seven_Dwarfs
## 10 Space_Mountain
## 11 Splash_Mountain
## 12 Star_Tours
## 13 Winnie_Pooh
## 14 Tomorrowland_Speedway
## 15 Toy_Story
## 16 Under_Sea
## 17 mention_ride
## 18 no_mention_ride
colSums(hongkong[c(4:21)])
## Big_Thunder Dinosaur Dumbo
## 6 1 6
## Expedition_Everest Haunted_Mansion Small_World
## 10 94 179
## Jungle_Cruise Pirates Seven_Dwarfs
## 85 29 1
## Space_Mountain Splash_Mountain Star_Tours
## 785 17 21
## Winnie_Pooh Tomorrowland_Speedway Toy_Story
## 20 1 3
## Under_Sea mention_ride no_mention_ride
## 1 1008 8139
# get the avg rating for each ride
df <- data.frame()
for (i in 1:nrow(rides_hk)) {
for (k in rides_hk[i,]) {
avg_rating <- hongkong %>%
filter(hongkong[,k] != 0) %>%
summarise(avg_rating = mean(Rating))
df <- rbind(df, avg_rating)
}
}
# ride names, counts, and average ratings
ride_mention_count <- data.frame(rides = names(colSums(hongkong[c(4:21)])),
counts = as.numeric(colSums(hongkong[c(4:21)])),
avg_raing = df)
# Frequency of each ride in total branch reviews & in reviews that mentioned rides in the branch
freq_all <- ride_mention_count %>%
mutate(freq_branch_pct = counts/nrow(hongkong)*100,
freq_Mention_pct = counts/counts[17]*100) %>%
mutate_at(c(3:5), round, digits = 2)%>%
arrange(desc(freq_Mention_pct))
freq_all
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 no_mention_ride 8139 4.21 88.98 807.44
## 2 mention_ride 1008 4.26 11.02 100.00
## 3 Space_Mountain 785 4.26 8.58 77.88
## 4 Small_World 179 4.35 1.96 17.76
## 5 Haunted_Mansion 94 4.31 1.03 9.33
## 6 Jungle_Cruise 85 4.25 0.93 8.43
## 7 Pirates 29 3.76 0.32 2.88
## 8 Star_Tours 21 4.29 0.23 2.08
## 9 Winnie_Pooh 20 4.15 0.22 1.98
## 10 Splash_Mountain 17 3.71 0.19 1.69
## 11 Expedition_Everest 10 4.20 0.11 0.99
## 12 Big_Thunder 6 4.50 0.07 0.60
## 13 Dumbo 6 4.50 0.07 0.60
## 14 Toy_Story 3 4.67 0.03 0.30
## 15 Dinosaur 1 3.00 0.01 0.10
## 16 Seven_Dwarfs 1 4.00 0.01 0.10
## 17 Tomorrowland_Speedway 1 3.00 0.01 0.10
## 18 Under_Sea 1 4.00 0.01 0.10
# proportions of reviews that mentioned rides & no mention rides in the branch
p_mention_orno <- freq_all[1:2, 1:4]
p_mention_orno
## rides counts avg_rating freq_branch_pct
## 1 no_mention_ride 8139 4.21 88.98
## 2 mention_ride 1008 4.26 11.02
p_mention_orno%>%
ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
labs(fill = "Ride Experience Mentioned") +
ggtitle("Reviews that mentioned & did not mention ride experiences in Hong Kong") +
theme_void()+
geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
scale_fill_manual(values=c("#F1786C", "#E8CCC7"),
labels=c("Mentioned", "Did not mention"))
88.98% did not mention ride experiences in HK branch with an average rating of 4.21; 11.02% mentioned ride experiences in HK branch with a slightly higher average rating of 4.26.
# proportions of rides mentioned in the branch
# over total number of reviews in the branch & over reviews that mentioned rides in branch
p_rides <- freq_all[3:18, 1:5]
# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_rides <- p_rides %>%
filter(freq_Mention_pct >= 2)
hot_rides
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 Space_Mountain 785 4.26 8.58 77.88
## 2 Small_World 179 4.35 1.96 17.76
## 3 Haunted_Mansion 94 4.31 1.03 9.33
## 4 Jungle_Cruise 85 4.25 0.93 8.43
## 5 Pirates 29 3.76 0.32 2.88
## 6 Star_Tours 21 4.29 0.23 2.08
#
ride_rank_plot <- hot_rides%>%
ggplot(aes(x=reorder(rides,freq_Mention_pct),
y=freq_Mention_pct, fill=freq_Mention_pct))+
geom_col(position='dodge')+
coord_flip()+
labs(x = "", y = "Frequency in %") +
ggtitle("All rides mentioned in Hong Kong with % distribution")+
scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = FALSE) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.title = element_text(size = 15, hjust = -0.267, vjust = 1))
# rating
rating_rank_plot <- hot_rides %>%
ggplot() +
geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct),
width=0.7, position='dodge')+
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(-10, 80)) +
geom_text(aes(x=rides, y=-5, label=avg_rating), size = 4.5) +
labs(x = "Mostly mentioned rides", y = "Frequency in %") +
ggtitle("Average rating ranked with popular rides & no ride") +
scale_fill_gradient(low = "#FCDADA", high = "#C50F0F", guide = FALSE)+
labs(tag ="Ratings:")+
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.tag.position = c(0.01,0.115),
plot.tag = element_text(hjust = 0, size=12.5),
plot.title = element_text(size = 15)) +
geom_vline(xintercept = 5.5, linetype = "dashed") +
geom_text(aes(label = "No rides mentioned", x = 5.5, y=50), angle = 45, color = "#7C7C7C", size = 4)+
geom_text(aes(label = "4.21", x = 5.5, y=-5), size = 4)
grid.arrange(ride_rank_plot, rating_rank_plot, ncol = 2)
From the plot above, We can see that the mostly mentioned ride in Hong
Kong branch is Space_Mountain ride, following by Small_World and
Haunted_Mansion. Reviews who mentioned Small_World, Haunted_Mansion, and
Star_Tours tend to give higher ratings on average. Space_Mountain, the
ride that significantly more people mentioned about, obtained an average
ratings that is only slightly higher than those that did not mention any
rides. Pirates ride received lower rating than those did not mention
rides, which might suggest that visitors might have bad impressions on
this ride and caused them to leave a relatively neutral or negative
ratings.
#head(cali)
rides_ca = data.frame(rd=names(colSums(cali[c(4:25)])))
rides_ca # CA: 1:3(info) + 4:23(20 rides) + 24:25(2 mention cols)
## rd
## 1 Astro_Orbiter
## 2 Big_Thunder
## 3 Dinosaur
## 4 Dumbo
## 5 Expedition_Everest
## 6 Haunted_Mansion
## 7 Small_World
## 8 Jungle_Cruise
## 9 Mad_Tea
## 10 Peter_Pan
## 11 Pirates
## 12 Soarin_Around
## 13 Space_Mountain
## 14 Splash_Mountain
## 15 Star_Tours
## 16 Test_Track
## 17 Winnie_Pooh
## 18 Twilight_Zone
## 19 Toy_Story
## 20 Under_Sea
## 21 mention_ride
## 22 no_mention_ride
colSums(cali[c(4:25)])
## Astro_Orbiter Big_Thunder Dinosaur Dumbo
## 8 85 27 5
## Expedition_Everest Haunted_Mansion Small_World Jungle_Cruise
## 4 828 401 244
## Mad_Tea Peter_Pan Pirates Soarin_Around
## 23 48 663 4
## Space_Mountain Splash_Mountain Star_Tours Test_Track
## 1439 840 548 10
## Winnie_Pooh Twilight_Zone Toy_Story Under_Sea
## 4 7 20 15
## mention_ride no_mention_ride
## 3079 15123
# get the avg rating for each ride
df <- data.frame()
for (i in 1:nrow(rides_ca)) {
for (k in rides_ca[i,]) {
avg_rating <- cali %>%
filter(cali[,k] != 0) %>%
summarise(avg_rating = mean(Rating))
df <- rbind(df, avg_rating)
}
}
# ride names, counts, and average ratings
ride_mention_count <- data.frame(rides = names(colSums(cali[c(4:25)])),
counts = as.numeric(colSums(cali[c(4:25)])),
avg_raing = df)
# Frequency of each ride in total branch reviews & in reviews that mentioned rides in the branch
freq_all <- ride_mention_count %>%
mutate(freq_branch_pct = counts/nrow(cali)*100,
freq_Mention_pct = counts/counts[21]*100) %>%
mutate_at(c(3:5), round, digits = 2)%>%
arrange(desc(freq_Mention_pct))
freq_all
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 no_mention_ride 15123 4.43 83.08 491.17
## 2 mention_ride 3079 4.36 16.92 100.00
## 3 Space_Mountain 1439 4.37 7.91 46.74
## 4 Splash_Mountain 840 4.38 4.61 27.28
## 5 Haunted_Mansion 828 4.43 4.55 26.89
## 6 Pirates 663 4.42 3.64 21.53
## 7 Star_Tours 548 4.52 3.01 17.80
## 8 Small_World 401 4.36 2.20 13.02
## 9 Jungle_Cruise 244 4.37 1.34 7.92
## 10 Big_Thunder 85 4.59 0.47 2.76
## 11 Peter_Pan 48 4.31 0.26 1.56
## 12 Dinosaur 27 4.37 0.15 0.88
## 13 Mad_Tea 23 4.78 0.13 0.75
## 14 Toy_Story 20 4.25 0.11 0.65
## 15 Under_Sea 15 4.40 0.08 0.49
## 16 Test_Track 10 4.30 0.05 0.32
## 17 Astro_Orbiter 8 4.62 0.04 0.26
## 18 Twilight_Zone 7 4.43 0.04 0.23
## 19 Dumbo 5 4.80 0.03 0.16
## 20 Expedition_Everest 4 4.50 0.02 0.13
## 21 Soarin_Around 4 4.50 0.02 0.13
## 22 Winnie_Pooh 4 5.00 0.02 0.13
# proportions of reviews that mentioned rides & no mention rides in the branch
p_mention_orno <- freq_all[1:2, 1:4]
p_mention_orno
## rides counts avg_rating freq_branch_pct
## 1 no_mention_ride 15123 4.43 83.08
## 2 mention_ride 3079 4.36 16.92
p_mention_orno%>%
ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
labs(fill = "Ride Experience Mentioned") +
ggtitle("Reviews that mentioned & did not mention ride experiences in California") +
theme_void()+
geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
scale_fill_manual(values=c("#19B34C", "#D6F1DF"),
labels=c("Mentioned", "Did not mention"))
83.08% did not mention ride experiences in CA branch with an average rating of 4.43; 16.92% mentioned ride experiences in CA branch with a slightly lower average rating of 4.36. This finding is different from What we had in HK branch, where the average rating is slightly higher for reviews that mentioned raide experiences.
# proportions of rides mentioned in the branch
# over total number of reviews in the branch & over reviews that mentioned rides in branch
p_rides <- freq_all[3:22, 1:5]
# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_rides <- p_rides %>%
filter(freq_Mention_pct >= 2)
hot_rides
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 Space_Mountain 1439 4.37 7.91 46.74
## 2 Splash_Mountain 840 4.38 4.61 27.28
## 3 Haunted_Mansion 828 4.43 4.55 26.89
## 4 Pirates 663 4.42 3.64 21.53
## 5 Star_Tours 548 4.52 3.01 17.80
## 6 Small_World 401 4.36 2.20 13.02
## 7 Jungle_Cruise 244 4.37 1.34 7.92
## 8 Big_Thunder 85 4.59 0.47 2.76
#
ride_rank_plot <- hot_rides%>%
ggplot(aes(x=reorder(rides,freq_Mention_pct),
y=freq_Mention_pct, fill=freq_Mention_pct))+
geom_col(position='dodge')+
coord_flip()+
labs(x = "", y = "Frequency in %") +
ggtitle("All rides mentioned in California with % distribution")+
scale_fill_gradient(low = "#D6F1DF", high = "#19B34C", guide = FALSE) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.title = element_text(size = 15, hjust = -0.267, vjust = 1))
#
rating_rank_plot <- hot_rides %>%
ggplot() +
geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct),
width=0.7, position='dodge')+
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(-10, 80)) +
geom_text(aes(x=rides, y=-5, label=avg_rating), size = 4.5) +
labs(x = "Mostly mentioned rides", y = "Frequency in %") +
ggtitle("Average rating ranked with popular rides & no ride") +
scale_fill_gradient(low = "#D6F1DF", high = "#19B34C", guide = FALSE)+
labs(tag ="Ratings:")+
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.tag.position = c(0.01,0.108),
plot.tag = element_text(hjust = 0, size=12.5),
plot.title = element_text(size = 15)) +
geom_vline(xintercept = 3, linetype = "dashed") +
geom_text(aes(label = "No rides mentioned", x = 3, y=50), angle = 45, color = "#7C7C7C", size = 4)+
geom_text(aes(label = " ", x = 5.5, y=-5), size = 5)
grid.arrange(ride_rank_plot, rating_rank_plot, ncol = 2)
From the plot above, We can see that the mostly mentioned ride in
California branch is also Space_Mountain ride, which is the same as in
Hong Kong branch, following by Splash_Mountain and Haunted_Mansion.
However, when it comes to average ratings, we found that in California,
rides that people tend to mention more about did not lead to higher
ratings. The rating even tend to be lower than those did not mention any
rides. The mostly mentioned ride, Space_Mountain, received an average
rating of 4.37. Although this average rating is slightly higher than the
same ride in Hong Kong branch, it is even lower than the ratings that
did not mention rides in California. This might suggest that consumers
are not entirely happy about their general ride experiences.
#head(paris)
rides_pr = data.frame(rd=names(colSums(paris[c(4:22)])))
rides_pr # PR: 4:20(17 rides) + 21:22(2 mention cols)
## rd
## 1 Astro_Orbiter
## 2 Big_Thunder
## 3 Dinosaur
## 4 Dumbo
## 5 Haunted_Mansion
## 6 Small_World
## 7 Jungle_Cruise
## 8 Mission_Space
## 9 Peter_Pan
## 10 Pirates
## 11 Rock_Roller
## 12 Space_Mountain
## 13 Splash_Mountain
## 14 Star_Tours
## 15 Twilight_Zone
## 16 Toy_Story
## 17 Under_Sea
## 18 mention_ride
## 19 no_mention_ride
colSums(paris[c(4:22)])
## Astro_Orbiter Big_Thunder Dinosaur Dumbo Haunted_Mansion
## 1 27 1 17 141
## Small_World Jungle_Cruise Mission_Space Peter_Pan Pirates
## 286 8 5 78 601
## Rock_Roller Space_Mountain Splash_Mountain Star_Tours Twilight_Zone
## 5 1241 20 334 16
## Toy_Story Under_Sea mention_ride no_mention_ride
## 1 2 1951 10743
# get the avg rating for each ride
df <- data.frame()
for (i in 1:nrow(rides_pr)) {
for (k in rides_pr[i,]) {
avg_rating <- paris %>%
filter(paris[,k] != 0) %>%
summarise(avg_rating = mean(Rating))
df <- rbind(df, avg_rating)
}
}
# ride names, counts, and average ratings
ride_mention_count <- data.frame(rides = names(colSums(paris[c(4:22)])),
counts = as.numeric(colSums(paris[c(4:22)])),
avg_raing = df)
# Frequency of each ride in total branch reviews & in reviews that mentioned rides in the branch
freq_all <- ride_mention_count %>%
mutate(freq_branch_pct = counts/nrow(paris)*100,
freq_Mention_pct = counts/counts[18]*100) %>%
mutate_at(c(3:5), round, digits = 2)%>%
arrange(desc(freq_Mention_pct))
freq_all
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 no_mention_ride 10743 3.98 84.63 550.64
## 2 mention_ride 1951 3.97 15.37 100.00
## 3 Space_Mountain 1241 4.01 9.78 63.61
## 4 Pirates 601 3.96 4.73 30.80
## 5 Star_Tours 334 3.96 2.63 17.12
## 6 Small_World 286 4.17 2.25 14.66
## 7 Haunted_Mansion 141 3.73 1.11 7.23
## 8 Peter_Pan 78 4.18 0.61 4.00
## 9 Big_Thunder 27 3.89 0.21 1.38
## 10 Splash_Mountain 20 3.95 0.16 1.03
## 11 Dumbo 17 4.18 0.13 0.87
## 12 Twilight_Zone 16 3.88 0.13 0.82
## 13 Jungle_Cruise 8 3.88 0.06 0.41
## 14 Mission_Space 5 4.20 0.04 0.26
## 15 Rock_Roller 5 3.40 0.04 0.26
## 16 Under_Sea 2 5.00 0.02 0.10
## 17 Astro_Orbiter 1 4.00 0.01 0.05
## 18 Dinosaur 1 3.00 0.01 0.05
## 19 Toy_Story 1 5.00 0.01 0.05
# proportions of reviews that mentioned rides & no mention rides in the branch
p_mention_orno <- freq_all[1:2, 1:4]
p_mention_orno
## rides counts avg_rating freq_branch_pct
## 1 no_mention_ride 10743 3.98 84.63
## 2 mention_ride 1951 3.97 15.37
# avg_rating frequency %
# HK: no_mention: 4.21 88.98
# mentioned: 4.26 11.02
# CA: no_mention: 4.43 83.08
# mentioned: 4.36 16.92
p_mention_orno%>%
ggplot(aes(x="", y=freq_branch_pct, fill=rides)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
labs(fill = "Ride Experience Mentioned") +
ggtitle("Reviews that mentioned & did not mention ride experiences in Paris") +
theme_void()+
geom_text(aes(label = paste0(freq_branch_pct, "%")), position = position_stack(vjust=0.5)) +
scale_fill_manual(values=c("#4169E2", "#BACBFF"),
labels=c("Mentioned", "Did not mention"))
84.63% did not mention ride experiences in PR branch with an average rating of 3.98; 15.37% mentioned ride experiences in PR branch with a very similar average rating of 3.97. Comparing to the two branches before, the two average ratings are much lower than in HK and CA. This might suggest that Paris branch receive generally lower ratings compared to other branches. The proportions of rides mentioned in the review text verses did not are similar compared to other two branches.
# proportions of rides mentioned in the branch
# over total number of reviews in the branch & over reviews that mentioned rides in branch
p_rides <- freq_all[3:19, 1:5]
# filter freqs >= 2% of the reviews that mentioned rides in the branch, rating high to low
hot_rides <- p_rides %>%
filter(freq_Mention_pct >= 2)
hot_rides
## rides counts avg_rating freq_branch_pct freq_Mention_pct
## 1 Space_Mountain 1241 4.01 9.78 63.61
## 2 Pirates 601 3.96 4.73 30.80
## 3 Star_Tours 334 3.96 2.63 17.12
## 4 Small_World 286 4.17 2.25 14.66
## 5 Haunted_Mansion 141 3.73 1.11 7.23
## 6 Peter_Pan 78 4.18 0.61 4.00
ride_rank_plot <- hot_rides%>%
ggplot(aes(x=reorder(rides,freq_Mention_pct),
y=freq_Mention_pct, fill=freq_Mention_pct))+
geom_col(position='dodge')+
coord_flip()+
labs(x = "", y = "Frequency in %") +
ggtitle("All rides mentioned in Paris with % distribution")+
scale_fill_gradient(low = "#D4DFFF", high = "#083DDC", guide = FALSE) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.x = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.title = element_text(size = 15, hjust = -0.267, vjust = 1))
rating_rank_plot <- hot_rides %>%
ggplot() +
geom_col(aes(x=reorder(rides, -avg_rating), y=freq_Mention_pct, fill=freq_Mention_pct),
width=0.7, position='dodge')+
scale_y_continuous(name = "Frequency in %",
expand = c(0,0),
limits = c(-10, 80)) +
geom_text(aes(x=rides, y=-5, label=avg_rating), size = 4.5) +
labs(x = "Mostly mentioned rides", y = "Frequency in %") +
ggtitle("Average rating ranked with popular rides & no ride") +
scale_fill_gradient(low = "#D4DFFF", high = "#083DDC", guide = FALSE)+
labs(tag ="Ratings:")+
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.grid.major.y = element_line(color = "light grey"),
axis.line = element_line(color = "black"),
plot.tag.position = c(0.01,0.115),
plot.tag = element_text(hjust = 0, size=12.5),
plot.title = element_text(size = 15)) +
geom_vline(xintercept = 3.5, linetype = "dashed") +
geom_text(aes(label = "No rides mentioned", x = 3.5, y=70), angle = 30, color = "#7C7C7C", size = 4)+
geom_text(aes(label = "3.98", x = 3.5, y=-5), size = 4)
grid.arrange(ride_rank_plot, rating_rank_plot, ncol = 2)
The mostly mentioned ride in Paris branch is also Space_Mountain ride,
following by Pirates and Star_Tours. As mentioned before, the general
rating for Paris branch is much lower than other two branches. This is
also true for the reviews regardless of mentioning or not mentioning
ride experiences. Peter_Pan and Small_World received similarly higher
average ratings, following by Space_Mountain. Reviewers who did not
mention any ride rated slightly higher on average than other rides
including Pirates, Star_Tours, and Haunted Mansion. This might suggest
that apart from Peter_Pan and Small_World, rides that are most
frequently mentioned in Paris branch might tend to receive more neutral
ratings. People tend not to give higher ratings when they mention rides
that are popular or mostly mentioned compared with the ratings without
mentioning rides, in which case, these most mentioned rides tend not to
be more favorable among visitors.
Integrate the three branches and attach to original dataset
# rbind all 3 branches
tri_branch <- rbind(hongkong[,20:21], cali[,24:25], paris[,21:22])
# cbind indicators to original dataset
disneyland <- cbind(disneyland, tri_branch)
Distribution of sentiment scores with ride experience
dis_w_shop <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', mention_ride == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)
agg_data <- aggregate(dis_w_shop[, 'value'], by = list(dis_w_shop$Review_ID), FUN = function(x) mean(x))
colnames(agg_data) <- c("Review_ID", "mean_value")
agg_data %>%
ggplot(aes(x=mean_value,fill=mean_value>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Hongkong Branch (contain ride experience)")
Distribution of sentiment scores without ride experience
dis_wt_shop <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', mention_ride == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)
agg_data <- aggregate(dis_wt_shop[, 'value'], by = list(dis_wt_shop$Review_ID), FUN = function(x) mean(x))
colnames(agg_data) <- c("Review_ID", "mean_value")
agg_data %>%
ggplot(aes(x=mean_value,fill=mean_value>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Hongkong Branch (not contain ride experience)")
Distribution of sentiment scores with ride experience
dis_w_shop <- disneyland %>%
filter(Branch == 'Disneyland_California', mention_ride == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)
agg_data <- aggregate(dis_w_shop[, 'value'], by = list(dis_w_shop$Review_ID), FUN = function(x) mean(x))
colnames(agg_data) <- c("Review_ID", "mean_value")
agg_data %>%
ggplot(aes(x=mean_value,fill=mean_value>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in California Branch (contain ride experience)")
Distribution of sentiment scores without ride experience
dis_wt_shop <- disneyland %>%
filter(Branch == 'Disneyland_California', mention_ride == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)
agg_data <- aggregate(dis_wt_shop[, 'value'], by = list(dis_wt_shop$Review_ID), FUN = function(x) mean(x))
colnames(agg_data) <- c("Review_ID", "mean_value")
agg_data %>%
ggplot(aes(x=mean_value,fill=mean_value>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in California Branch (not contain ride experience)")
Distribution of sentiment scores with ride experience
dis_w_shop <- disneyland %>%
filter(Branch == 'Disneyland_Paris', mention_ride == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)
agg_data <- aggregate(dis_w_shop[, 'value'], by = list(dis_w_shop$Review_ID), FUN = function(x) mean(x))
colnames(agg_data) <- c("Review_ID", "mean_value")
agg_data %>%
ggplot(aes(x=mean_value,fill=mean_value>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Paris Branch (contain ride experience)")
Distribution of sentiment scores without ride experience
dis_wt_shop <- disneyland %>%
filter(Branch == 'Disneyland_Paris', mention_ride == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn)
agg_data <- aggregate(dis_wt_shop[, 'value'], by = list(dis_wt_shop$Review_ID), FUN = function(x) mean(x))
colnames(agg_data) <- c("Review_ID", "mean_value")
agg_data %>%
ggplot(aes(x=mean_value,fill=mean_value>0))+
geom_histogram(binwidth = 0.1)+
scale_x_continuous(breaks=seq(-5,5,1))+
scale_fill_manual(values=c('tomato','seagreen'))+
theme_bw() +
labs(x = "Review Sentiment Scores", y = "Number of Reviews") +
ggtitle("Distribution of Review Sentiment in Paris Branch (not contain ride experience)")
Calculate proportions and average scores required
##### hong kong#####
# proportion of positive sentiment with shopping words
pos_prop_w_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', mention_ride == 1) %>%
select(Review_ID, Review_Text) %>%
unnest_tokens(output = word, input = Review_Text) %>%
inner_join(afinn) %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / NROW(word))
# proportion of positive sentiment without shopping words
pos_prop_wt_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', mention_ride == 0) %>%
select(Review_ID, Review_Text) %>%
unnest_tokens(output = word, input = Review_Text) %>%
inner_join(afinn) %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / NROW(word))
# average sentiment score with shopping words
mean_w_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', mention_ride == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
# average sentiment score without shopping words
mean_wt_shop_hk <- disneyland %>%
filter(Branch == 'Disneyland_HongKong', mention_ride == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
##### california #####
# proportion of positive sentiment with shopping words
pos_prop_w_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', mention_ride == 1) %>%
select(Review_ID, Review_Text) %>%
unnest_tokens(output = word, input = Review_Text) %>%
inner_join(afinn) %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / NROW(word))
# proportion of positive sentiment without shopping words
pos_prop_wt_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', mention_ride == 0) %>%
select(Review_ID, Review_Text) %>%
unnest_tokens(output = word, input = Review_Text) %>%
inner_join(afinn) %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / NROW(word))
# average sentiment score with shopping words
mean_w_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', mention_ride == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
# average sentiment score without shopping words
mean_wt_shop_ca <- disneyland %>%
filter(Branch == 'Disneyland_California', mention_ride == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
##### paris #####
# proportion of positive sentiment with shopping words
pos_prop_w_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', mention_ride == 1) %>%
select(Review_ID, Review_Text) %>%
unnest_tokens(output = word, input = Review_Text) %>%
inner_join(afinn) %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / NROW(word))
# proportion of positive sentiment without shopping words
pos_prop_wt_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', mention_ride == 0) %>%
select(Review_ID, Review_Text) %>%
unnest_tokens(output = word, input = Review_Text) %>%
inner_join(afinn) %>%
mutate(positive = ifelse(value >= 0, 1, 0)) %>%
summarize(proportion_positive = sum(positive) / NROW(word))
# average sentiment score with shopping words
mean_w_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', mention_ride == 1) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
# average sentiment score without shopping words
mean_wt_shop_paris <- disneyland %>%
filter(Branch == 'Disneyland_Paris', mention_ride == 0) %>%
select(Review_ID,Review_Text)%>%
group_by(Review_ID)%>%
unnest_tokens(output=word,input=Review_Text)%>%
inner_join(afinn) %>%
ungroup() %>%
summarize(avg = mean(value))
Proportion of positive words in reviews w/wt shopping words
# make proportion plot
df_prop = data.frame(branch = c('Hong Kong', 'California', 'Paris'),
prop_pos_words_w_shop = c(as.numeric(pos_prop_w_shop_hk), as.numeric(pos_prop_w_shop_ca), as.numeric(pos_prop_w_shop_paris)),
prop_pos_words_wt_shop = c(as.numeric(pos_prop_wt_shop_hk), as.numeric(pos_prop_wt_shop_ca), as.numeric(pos_prop_wt_shop_paris)))
df_prop_long <- tidyr::pivot_longer(df_prop, cols = c("prop_pos_words_w_shop", "prop_pos_words_wt_shop"), names_to = "category", values_to = "proportion")
ggplot(df_prop_long, aes(x = branch, y = proportion, fill = category)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7, color = "black", size = 0.5) +
scale_fill_manual(values = c("#0072B2", "#F0E442"),
labels=c("Mentioned", "Did not mention")) +
labs(title = "Proportion of Positive Words with and without ride experience", x = "Branch", y = "Proportion", fill = 'Ride experience mentioned') +
geom_text(aes(label=round(proportion, 2)), position=position_dodge(width=0.9), vjust=-0.25) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(size = 14, face = "bold"),
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12))
Average sentiment scores of reviews w/wt shopping words
# make average score plot
df_mean = data.frame(branch = c('Hong Kong', 'California', 'Paris'),
mean_w_shop = c(as.numeric(mean_w_shop_hk), as.numeric(mean_w_shop_ca), as.numeric(mean_w_shop_paris)),
mean_wt_shop = c(as.numeric(mean_wt_shop_hk), as.numeric(mean_wt_shop_ca), as.numeric(mean_wt_shop_paris)))
df_mean_long <- tidyr::pivot_longer(df_mean, cols = c("mean_w_shop", "mean_wt_shop"), names_to = "category", values_to = "proportion")
ggplot(df_mean_long, aes(x = branch, y = proportion, fill = category)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7, color = "black", size = 0.5) +
scale_fill_manual(values = c("#0072B2", "#F0E442"),
labels=c("Mentioned", "Did not mention")) +
labs(title = "Average sentiment scores with and without ride experience", x = "Branch", y = "Average sentiment score", fill = 'Ride experience mentioned') +
geom_text(aes(label=round(proportion, 2)), position=position_dodge(width=0.9), vjust=-0.25) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(size = 14, face = "bold"),
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12))
Regarding sentiment analysis, the difference in proportion of positive words between reviews with ride experience or not is ambiguous, but the average sentiment score for ride-related reviews are much lower, indicating that reviewers may have more extreme emotions toward ride experience. Thus, every branch should examine each of their ride services and make improvements accordingly.
We classified the topic words in the reviews into four categories: time, theme rides, dining, and customer services. We aim to find which specific topic words/word categories reviewers share in their reviews when they tend to provide a high & low rating.
Hypothesis 1:
Ho: Reviews discussing time words would not affect overall review
ratings.
Ha: Reviews discussing time words would affect overall review
ratings.
Hypothesis 2:
Ho: Reviews discussing food words would not affect overall review
ratings.
Ha: Reviews discussing food words would affect overall review
ratings.
Hypothesis 3:
Ho: Rides features would not affect overall review ratings.
Ha: Rides features would affect overall review ratings. Hypothesis
4:
Ho: Reviews discussing staff would not affect overall review
ratings.
Ha: Reviews discussing staff would affect overall review ratings.
We created a corpus of the reviews from the variable Review_Text using library(tm), and then preprocessed the corpus of reviews using functions from library(tm) as well. Specifically, we transformed text to lower case, removed punctuation, removed English stopwords using the dictionary tm::stopwords(“english”), removed numbers, remove whitespace, and stemed documents. We also retained all terms that appear in 5% or more reviews and converted the document-term-matrix to a data frame so that the column names would be the tokens.
#head(disneyland)
#install.packages("tm")
library(tm)
#install.packages('topicmodels')
library(topicmodels)
#Clean the Review_Text column:
corpus1 <- Corpus(VectorSource(disneyland$Review_Text))
#(a) transform text to lower case,
corpus1 = tm_map(corpus1,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus1, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus1 = tm_map(corpus1,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus1 = tm_map(corpus1,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus1, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus1 = tm_map(corpus1,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus1 = tm_map(corpus1,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus1 = tm_map(corpus1,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus1, FUN = stemDocument): transformation
## drops documents
dict1 = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(disneyland$Review_Text))),
lowfreq = 0)
dict_corpus1 = Corpus(VectorSource(dict1))
dtm1 <- DocumentTermMatrix(corpus1)
# Remove Sparse Terms
xdtm1 = removeSparseTerms(dtm1,sparse = 0.95)
#xdtm
# Complete Stems
xdtm1 = as.data.frame(as.matrix(xdtm1))
colnames(xdtm1) = stemCompletion(x = colnames(xdtm1),
dictionary = dict_corpus1,
type='prevalent')
colnames(xdtm1) = make.names(colnames(xdtm1))
#head(sort(colSums(xdtm1),decreasing = T))
We also created another document term matrix using Term Frequency - Inverse Document Frequency Weighting.
# Document Term Matrix - tfidf
dtm_tfidf1 = DocumentTermMatrix(x=corpus1,
control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf1 = removeSparseTerms(dtm_tfidf1,sparse = 0.95)
xdtm_tfidf1 = as.data.frame(as.matrix(xdtm_tfidf1))
colnames(xdtm_tfidf1) = stemCompletion(x = colnames(xdtm_tfidf1),
dictionary = dict_corpus1,
type='prevalent')
colnames(xdtm_tfidf1) = make.names(colnames(xdtm_tfidf1))
#head(sort(colSums(xdtm_tfidf1),decreasing = T))
Then we plotted a bar chart which contrasts the weights of term frequency and term frequency inverse document frequency weighting for the top 35 terms.
library(tidyr); library(dplyr); library(ggplot2); library(ggthemes);library(RColorBrewer)
data.frame(term = colnames(xdtm1),tf = colMeans(xdtm1), tfidf = colMeans(xdtm_tfidf1))%>%
arrange(desc(tf))%>%
top_n(35)%>%
gather(key=weighting_method,value=weight,2:3)%>%
ggplot(aes(x=term,y=weight,fill=weighting_method))+
geom_col(position='dodge')+
coord_flip()+
theme_economist()+
scale_fill_manual(values = brewer.pal(9, "Set2")[c(7, 8)])
## Selecting by tfidf
## Warning in brewer.pal(9, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
After tokenizing the reviews’ word frequency as a whole, we could see that reviewers mentioned factors, such as time, rides, and food, very frequently, no matter using Term Frequency - Inverse Document Frequency Weighting or not. Therefore, topic word categories can be considered as time, theme rides, dining, and customer services, and topic words are time, rides, staff, and food. These motivated us to further separate ratings and sentiments to inspect words in negative-rating reviews (ratings under 3).
We created two document term matrices using both Term frequency and Term Frequency - Inverse Document Frequency Weighting for the negative-rating reviews and plotted a bar chart for the top 30 terms to compare two weightings. Similar to the bar chart of all-rating reviews above, Terms “ride” and “park” are heavy weighted. Term Frequency assigns them a heavy weight because they are the most frequently occurring term, and they appear in most of the reviews as well, while Term Frequency - Inverse Document Frequency assigns them a much lower weight. Topic word “food” is in the collection as well. Nevertheless, term “show” is not included while “staff” appears. And terms related to time are included a lot, such as “wait,” “minute,” and “hour.”
negativeDisney<-disneyland %>% subset(Rating == c(1,2))
## Warning in Rating == c(1, 2): longer object length is not a multiple of shorter
## object length
# Create a document-term matrix for the negative sentiment words
#Clean the Review_Text column:
corpus <- Corpus(VectorSource(negativeDisney$Review_Text))
#(a) transform text to lower case,
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus = tm_map(corpus,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus = tm_map(corpus,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stemDocument): transformation drops
## documents
dtm <- DocumentTermMatrix(corpus)
#inspect(dtm)
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(negativeDisney$Review_Text))),
lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))
# Remove Sparse Terms
xdtm = removeSparseTerms(dtm,sparse = 0.95)
#xdtm
# Complete Stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
#head(sort(colSums(xdtm),decreasing = T))
# Document Term Matrix - tfidf
dtm_tfidf = DocumentTermMatrix(x=corpus,
control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf = removeSparseTerms(dtm_tfidf,sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
colnames(xdtm_tfidf) = stemCompletion(x = colnames(xdtm_tfidf),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm_tfidf) = make.names(colnames(xdtm_tfidf))
#head(sort(colSums(xdtm_tfidf),decreasing = T))
data.frame(term = colnames(xdtm),tf = colMeans(xdtm), tfidf = colMeans(xdtm_tfidf))%>%
arrange(desc(tf))%>%
top_n(30)%>%
gather(key=weighting_method,value=weight,2:3)%>%
ggplot(aes(x=term,y=weight,fill=weighting_method))+
geom_col(position='dodge')+
coord_flip()+
theme_economist()+
scale_fill_manual(values = brewer.pal(9, "Set2")[c(7, 8)])
## Selecting by tfidf
## Warning in brewer.pal(9, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
After zooming into the negative-rating reviews, we would like to take a look at the neutral-rating and positive rating reviews as well. For neutral-rating reviews (rating=3), we plotted the bar chart again to contrast the weights of term frequency and term frequency inverse document frequency weighting for the top 30 terms. Terms “ride” and “park” are heavy weighted again. Time words and food remain as two of the focuses. Term “staff” is still in the top 30, but the term “show” appears this time.
neutralDisney<-disneyland %>% subset(Rating == 3)
# Create a document-term matrix for the negative sentiment words
#Clean the Review_Text column:
corpus <- Corpus(VectorSource(neutralDisney$Review_Text))
#(a) transform text to lower case,
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus = tm_map(corpus,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus = tm_map(corpus,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stemDocument): transformation drops
## documents
dtm <- DocumentTermMatrix(corpus)
#inspect(dtm)
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(neutralDisney$Review_Text))),
lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))
# Remove Sparse Terms
xdtm = removeSparseTerms(dtm,sparse = 0.95)
#xdtm
# Complete Stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
#head(sort(colSums(xdtm),decreasing = T))
# Document Term Matrix - tfidf
dtm_tfidf = DocumentTermMatrix(x=corpus,
control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf = removeSparseTerms(dtm_tfidf,sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
colnames(xdtm_tfidf) = stemCompletion(x = colnames(xdtm_tfidf),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm_tfidf) = make.names(colnames(xdtm_tfidf))
#head(sort(colSums(xdtm_tfidf),decreasing = T))
data.frame(term = colnames(xdtm),tf = colMeans(xdtm), tfidf = colMeans(xdtm_tfidf))%>%
arrange(desc(tf))%>%
top_n(30)%>%
gather(key=weighting_method,value=weight,2:3)%>%
ggplot(aes(x=term,y=weight,fill=weighting_method))+
geom_col(position='dodge')+
coord_flip()+
theme_economist()+
scale_fill_manual(values = brewer.pal(9, "Set2")[c(7, 8)])
## Selecting by tfidf
## Warning in brewer.pal(9, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
We also plotted term frequency versus term frequency inverse document frequency weighting top-30-term bar chart for the positive-rating reviews as shown below. We could see that terms like “love” and “enjoy” appear in the top 30, while time words
positiveDisney<-disneyland %>% subset(Rating == c(4,5))
## Warning in Rating == c(4, 5): longer object length is not a multiple of shorter
## object length
# Create a document-term matrix for the negative sentiment words
#Clean the Review_Text column:
corpus <- Corpus(VectorSource(positiveDisney$Review_Text))
#(a) transform text to lower case,
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
#(b) remove punctuation,
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
#(c) remove English stopwords using the following dictionary tm::stopwords("english")
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
#(d) remove numbers
corpus = tm_map(corpus,FUN = removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeNumbers): transformation
## drops documents
#(e) remove whitespace
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
#(f) stem words
corpus = tm_map(corpus,FUN = stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stemDocument): transformation drops
## documents
dtm <- DocumentTermMatrix(corpus)
#inspect(dtm)
dict = findFreqTerms(DocumentTermMatrix(Corpus(VectorSource(positiveDisney$Review_Text))),
lowfreq = 0)
dict_corpus = Corpus(VectorSource(dict))
# Remove Sparse Terms
xdtm = removeSparseTerms(dtm,sparse = 0.95)
#xdtm
# Complete Stems
xdtm = as.data.frame(as.matrix(xdtm))
colnames(xdtm) = stemCompletion(x = colnames(xdtm),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm) = make.names(colnames(xdtm))
#head(sort(colSums(xdtm),decreasing = T))
# Document Term Matrix - tfidf
dtm_tfidf = DocumentTermMatrix(x=corpus,
control = list(weighting=function(x) weightTfIdf(x,normalize=F)))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
xdtm_tfidf = removeSparseTerms(dtm_tfidf,sparse = 0.95)
xdtm_tfidf = as.data.frame(as.matrix(xdtm_tfidf))
colnames(xdtm_tfidf) = stemCompletion(x = colnames(xdtm_tfidf),
dictionary = dict_corpus,
type='prevalent')
colnames(xdtm_tfidf) = make.names(colnames(xdtm_tfidf))
#head(sort(colSums(xdtm_tfidf),decreasing = T))
data.frame(term = colnames(xdtm),tf = colMeans(xdtm), tfidf = colMeans(xdtm_tfidf))%>%
arrange(desc(tf))%>%
top_n(30)%>%
gather(key=weighting_method,value=weight,2:3)%>%
ggplot(aes(x=term,y=weight,fill=weighting_method))+
geom_col(position='dodge')+
coord_flip()+
theme_economist()+
scale_fill_manual(values = brewer.pal(9, "Set2")[c(7, 8)])
## Selecting by tfidf
## Warning in brewer.pal(9, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
After plotting three term frequency versus term frequency inverse document frequency weighting top-30-term bar charts, we found that term “staff” only appears in top 30 of negative and neutral rating reviews. Term “food,” topic words related to “time,” and term “ride” are shown in all three types of rating reviews. Therefore, we would like to further investigate these terms separately.
Based on previous analysis, term “food” is a focus over all rating reviews.
# create a vector of food-related keywords
keywords <- c('food','foods', 'eating','eat','ate', 'restaurant', 'meal','drink','snack','churro','cake','turkey','corn','popcorn','pretzel','pizza','taco', 'burger','sandwiche','milkshake','fries', 'chicken','beef','pork', 'beer','wine','breakfast','lunch','dinner')
# create a logical vector indicating whether each Review_Text contains at least one keyword
contains_keyword <- grepl(paste0("\\b", keywords, "\\b", collapse = "|"), disneyland$Review_Text, ignore.case = TRUE)
# subset the original dataframe to create a new dataframe allFoods containing only rows with Review_text that contains at least one keyword
allFoods <- disneyland[contains_keyword, ]
str(allFoods)
## 'data.frame': 13639 obs. of 12 variables:
## $ Review_ID : int 670623270 670607911 670591897 670574142 670571027 670570869 670274554 670205135 670129921 670007081 ...
## $ Rating : int 4 4 3 3 2 5 5 3 3 3 ...
## $ Year_Month : chr "Apr 2019" "Apr 2019" "Apr 2019" "Mar 2019" ...
## $ Reviewer_Location: chr "United Arab Emirates" "Australia" "Singapore" "Malaysia" ...
## $ Review_Text : chr "Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be a big issue (t"| __truncated__ "HK Disneyland is a great compact park. Unfortunately there is quite a bit of maintenance work going on at prese"| __truncated__ "Have been to Disney World, Disneyland Anaheim and Tokyo Disneyland but I feel that Disneyland Hong Kong is real"| __truncated__ "Think of it as an intro to Disney magic for the little ones. Almost all of the attractions can be completed in "| __truncated__ ...
## $ Branch : chr "Disneyland_HongKong" "Disneyland_HongKong" "Disneyland_HongKong" "Disneyland_HongKong" ...
## $ Year : int 2019 2019 2019 2019 2019 2019 2018 2019 2019 2019 ...
## $ Month : int 4 4 4 3 4 3 9 1 4 4 ...
## $ Rating_type : chr "positive" "positive" "neutral" "neutral" ...
## $ continent : chr "Asia" "Oceania" "Asia" "Asia" ...
## $ mention_ride : num 0 0 0 0 0 0 0 0 0 0 ...
## $ no_mention_ride : num 1 1 1 1 1 1 1 1 1 1 ...
We found out the total number of positive and negative words in the reviews and plotted bar charts.
allFoods%>%
group_by(Review_ID)%>%
unnest_tokens(output = word, input = Review_Text)%>%
inner_join(get_sentiments('bing'))%>%
group_by(sentiment)%>%
count()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 57961 of `x` matches multiple rows in `y`.
## ℹ Row 3619 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## # A tibble: 2 × 2
## # Groups: sentiment [2]
## sentiment n
## <chr> <int>
## 1 negative 54280
## 2 positive 122195
# bar chart for positive and negative sentiments in reviews mentioned foods over 5 ratings using 'bing' dictionary
allFoods %>%
select(Review_ID,Review_Text,Rating)%>%
group_by(Review_ID, Rating)%>%
unnest_tokens(output=word,input=Review_Text)%>%
ungroup()%>%
inner_join(get_sentiments('bing'))%>%
group_by(Rating,sentiment)%>%
summarize(n = n())%>%
mutate(proportion = n/sum(n))%>%
ggplot(aes(x=Rating,y=proportion,fill=sentiment))+
geom_col()+
theme_economist()+
coord_flip()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 57961 of `x` matches multiple rows in `y`.
## ℹ Row 3619 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## `summarise()` has grouped output by 'Rating'. You can override using the
## `.groups` argument.
# bar chart for Rating Distribution for Reviews Mentioned Foods
ggplot(allFoods, aes(Rating))+
geom_bar(stat = "count",position = 'dodge', fill="mistyrose3")+
ggtitle("Rating Distribution for Reviews Mentioned Foods")
Considering that the reviews contains comments about other aspects, such as rides and shows, we further examine sentences mentioned foods by selecting foods-related sentences and stored in a new dataframe foodSentences_ratings, which contains combined food-related sentences (one reviewer may has two or more sentences), Review_ID, and Rating.
# Split the Review_Text column into sentences
allFoods_sentences <- allFoods %>%
unnest_tokens(output=sentence, input=Review_Text, "sentences")
# Filter the sentences containing at least one food word
foodSentences <- allFoods_sentences %>%
filter(grepl(paste(keywords, collapse="|"), sentence))
dim(foodSentences)
## [1] 41367 12
#head(foodSentences)
# Combine the sentences for each unique Reviewer_ID
foodSentences_combined <- aggregate(sentence ~ Review_ID, data = foodSentences, FUN = paste, collapse = " ")
#foodSentences_combined
# Join the foodSentences_combined and allFoods dataframes by Reviewer_ID
foodSentences_ratings <- left_join(foodSentences_combined, allFoods[, c("Review_ID", "Rating")], by = "Review_ID")
dim(foodSentences_ratings)
## [1] 13639 3
#head(foodSentences_ratings)
We used “bing” dictionary and ‘nrc’ lexicon to examine the sentiments in the food sentences.
#plot a bar chart with "bing" dictionary
foodSentences_ratings%>%
group_by(Review_ID)%>%
unnest_tokens(output = word, input = sentence)%>%
inner_join(get_sentiments('bing'))%>%
group_by(sentiment)%>%
count()%>%
ggplot(aes(x=sentiment,y=n,fill=sentiment))+
geom_col()+
theme_economist()+
guides(fill=F)+
coord_flip()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 970771 of `x` matches multiple rows in `y`.
## ℹ Row 2698 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# plot another bar chart based on ratings
foodSentences_ratings %>%
select(Review_ID,sentence,Rating)%>%
group_by(Review_ID, Rating)%>%
unnest_tokens(output=word,input=sentence)%>%
ungroup()%>%
inner_join(get_sentiments('bing'))%>%
group_by(Rating,sentiment)%>%
summarize(n = n())%>%
mutate(proportion = n/sum(n))%>%
ggplot(aes(x=Rating,y=proportion,fill=sentiment))+
geom_col()+
theme_economist()+
coord_flip()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 970771 of `x` matches multiple rows in `y`.
## ℹ Row 2698 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## `summarise()` has grouped output by 'Rating'. You can override using the
## `.groups` argument.
# plot a bar chart with ‘nrc’ lexicon
library(lexicon)
nrc<-read.csv("nrc.csv")
foodSentences_ratings%>%
group_by(Review_ID)%>%
unnest_tokens(output = word, input = sentence)%>%
inner_join(nrc)%>%
group_by(sentiment)%>%
count()%>%
ggplot(aes(x=reorder(sentiment,X = n), y=n, fill=sentiment))+
geom_col()+
guides(fill=F)+
coord_flip()+
theme_wsj()
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 14 of `x` matches multiple rows in `y`.
## ℹ Row 5150 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
From the plots above, we can see that overall most reviewers maintain a positive attitude towards foods. The number of positive sentiment is about twice amount of number of negative sentiment. The number of positive emotions based on “nrc” lexicon is also relatively high. Therefore, we further zoomed into negative rating sentences to investigate the emotions expressed about foods in the low ratings (Rating == c(1,2)).
negativeFoods<-foodSentences_ratings %>% subset(Rating == c(1,2))
## Warning in Rating == c(1, 2): longer object length is not a multiple of shorter
## object length
#head(negativeFoods)
negativeFoods%>%
group_by(Review_ID)%>%
unnest_tokens(output = word, input = sentence)%>%
inner_join(nrc)%>%
group_by(sentiment)%>%
count()%>%
arrange(desc(n))
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10 of `x` matches multiple rows in `y`.
## ℹ Row 4201 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## # A tibble: 10 × 2
## # Groups: sentiment [10]
## sentiment n
## <chr> <int>
## 1 positive 3452
## 2 trust 2073
## 3 negative 2029
## 4 joy 1849
## 5 anticipation 1763
## 6 sadness 922
## 7 anger 907
## 8 fear 863
## 9 surprise 756
## 10 disgust 744
#plot a bar chart with "bing" dictionary
negativeFoods%>%
group_by(Review_ID)%>%
unnest_tokens(output = word, input = sentence)%>%
inner_join(get_sentiments('bing'))%>%
group_by(sentiment)%>%
count()%>%
ggplot(aes(x=sentiment,y=n,fill=sentiment))+
geom_col()+
theme_economist()+
guides(fill=F)+
coord_flip()
## Joining with `by = join_by(word)`
# plot a bar chart with ‘nrc’ lexicon
negativeFoods%>%
group_by(Review_ID)%>%
unnest_tokens(output = word, input = sentence)%>%
inner_join(nrc)%>%
group_by(sentiment)%>%
count()%>%
ggplot(aes(x=reorder(sentiment,X = n), y=n, fill=sentiment))+
geom_col()+
guides(fill=F)+
coord_flip()+
theme_wsj()
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10 of `x` matches multiple rows in `y`.
## ℹ Row 4201 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
The plots above show that in the negative ratings (Ratings under 3), most reviewers hold a negative attitude towards foods, which means that foods-related things may play a role in low ratings. Below we checked 20 reviews mentioned foods in low ratings.
# we checked 20 reviews, but only show 6 here.
head(negativeFoods$sentence, 3)
## [1] "it was absolutely freezing which obviously is not disney's fault but it was snowing when we arrived which was quite magical but the snow settled on ice and previous snow which basically created a lethal surface with which to walk on! in fairness the staff were very pleasant at disney but i think they knew they were very wrong in not cleaning putting something down to make floor less slippery.it ruined my holiday but i think the children still enjoyed themselves...favourites were buzz lightyear ride, peter pan ride and live animated stitch.glad i looked on here before we went for tips.. take rolls batches cheese and ham from buffet breakfast to eat throughout the day.we actually took a suitcase of our own food drink which was a life saver.rainforest cafe was great to eat...everywhere is expensive but you get an experience with this too!"
## [2] "we spent the 20 dec 2010 in the disney park and by lunchtime attractions were starting to close or break down. we walked to mgm studios and easily got in an into the first building which was populated with shops and food outlets. the open air magic carpet ride was open but it was left to the riders to clear the deep snow from the seats. people headed for the indoor attractions to find that under cover marshalling areas were closed on the pretext of dangerous snow levels on roofs and people were being advised to return later. fast pass booths were closed presunably because of weather damage. shops and restaurants. these places became almost claustrphobically overpopulated. by lunchtime we had enjoyed only 4 hot chocolates each and one ride for with our grand children so we gave up and retried the disney park. a slight easing of the weather enabled two of the parades to operate but the main wish of our grandkids to see princesses was not achieved. in the end we booked an expensive character dinner just to enable them to meet mickey, minnie and some of the animal characters."
## [3] "the area for the buses, shuttles, taxis and private cars is three narrow lanes right outside of the train station. fortunately we overheard someone talking about our hotel and asked if they were waiting for the shuttle, which they were. funny how he didn't count or organize the people in any manner to avoid the angry, frustrated group that had to either rearrange, or get off the shuttle. this whole ordeal was repeated again every time we took the radisson blu shuttle. every restaurant we went in, with the exception of the hakuna matata, was beyond filthy. overflowing garbage cans, tables with gobs of food on, around and under them and barely a cast member in sight cleaning up."
tail(negativeFoods$sentence, 3)
## [1] "we went to disneyland paris during the easter break with family members who had come over from the states. not a great first impression. not great. we did manage to eat lunch early enough that we didn't have to wait a long time. however, the food that we got was not what we ordered. none of us had the wherewithal to go back and try to fix the order. so we just ate what we got. there were plenty of other foodcarts around the park that were empty. not enough staff at the park to operate these."
## [2] "people that do not have that much to spend will be held back, because people with money obviously can skip the lines...absolutely ridiculous.getting something to eat was horrible, again lines everywhere...just to eat someting right..."
## [3] "walt disney would be horrified, not enough eating places for the amount of people or rest rooms."
There are many foods-related issues, including over-pricing, not enough and unclean eating places, offering too much unhealthy/poor quality food, and too long waiting time.
Considering words related to time were mentioned frequently by
reviewers giving negative reviews, we would like to investigate more
about time words in the reviews. By using spacyr package
that provides an interface to the spaCy natural language processing
(NLP) library in Python, we were able to do Named Entity Recognition
(NER). NER can be helpful for text classification tasks and identify and
classify named entities in text. Through using function
spacy_parse, we could access time words through the time
entity, which is consisted of “TIME_I” and “TIME_B.” The letters “B” and
“I” at the end of each entity type indicate whether the entity is the
beginning (“B”), such as “45,” or inside (“I”), such as “minites,” of a
multi-word entity.
library(spacyr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
#spacy_upgrade()
# Initialize connection to Python spaCy and create automatic Named Entity Recognition features.
#spacy_initialize("en_core_web_sm")
#spacy_install("en_core_web_sm")
#spacy_initialize(model = "en_core_web_sm")
ner_negativeDisney <- spacy_parse(negativeDisney$Review_Text, tag = FALSE,
entity = TRUE,lemma = FALSE)
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 3.5.1, language model: en_core_web_sm)
## (python options: type = "condaenv", value = "spacy_condaenv")
#glimpse(ner_negativeDisney)
wide_ner_negativeDisney <- dcast(ner_negativeDisney, doc_id ~ entity, value.var="entity")
## Aggregation function missing: defaulting to length
glimpse(wide_ner_negativeDisney)
## Rows: 1,642
## Columns: 36
## $ doc_id <chr> "text1", "text10", "text100", "text1000", "text1001", "t…
## $ Var.2 <int> 168, 106, 89, 40, 32, 147, 254, 323, 1408, 85, 436, 300,…
## $ CARDINAL_B <int> 1, 0, 0, 1, 0, 0, 2, 3, 19, 2, 13, 5, 0, 0, 9, 4, 2, 1, …
## $ CARDINAL_I <int> 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 3, 1, 0, 0, 1, 3, 0, 0, 0,…
## $ DATE_B <int> 2, 0, 1, 0, 0, 0, 8, 0, 8, 0, 3, 0, 0, 0, 3, 2, 0, 3, 2,…
## $ DATE_I <int> 2, 0, 0, 0, 0, 0, 6, 0, 11, 0, 4, 0, 0, 0, 1, 2, 0, 4, 4…
## $ EVENT_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ EVENT_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FAC_B <int> 2, 1, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1, 0, 0, 3, 1, 1, 1, 0,…
## $ FAC_I <int> 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0,…
## $ GPE_B <int> 0, 0, 1, 1, 1, 0, 1, 4, 3, 0, 1, 4, 0, 0, 3, 1, 1, 6, 2,…
## $ GPE_I <int> 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ LANGUAGE_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LAW_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LAW_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LOC_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 2, 3, 0, 0, 0,…
## $ LOC_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0,…
## $ MONEY_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ MONEY_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NORP_B <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NORP_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ORDINAL_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 6, 1,…
## $ ORG_B <int> 1, 1, 1, 0, 0, 3, 0, 4, 4, 1, 2, 1, 0, 2, 3, 5, 5, 4, 0,…
## $ ORG_I <int> 0, 0, 1, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 1, 2, 0, 2, 4, 0,…
## $ PERCENT_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2,…
## $ PERCENT_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2,…
## $ PERSON_B <int> 2, 0, 0, 0, 0, 0, 0, 1, 5, 1, 1, 0, 0, 0, 1, 2, 0, 0, 0,…
## $ PERSON_I <int> 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ PRODUCT_B <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PRODUCT_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ QUANTITY_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ QUANTITY_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ TIME_B <int> 0, 1, 1, 0, 0, 0, 2, 4, 10, 2, 2, 3, 0, 0, 4, 4, 1, 2, 0…
## $ TIME_I <int> 0, 0, 1, 0, 0, 0, 6, 5, 11, 2, 2, 5, 0, 0, 8, 8, 1, 1, 0…
## $ WORK_OF_ART_B <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ WORK_OF_ART_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Extract time words.
time_entities <- subset(ner_negativeDisney, entity == c("TIME_I", "TIME_B")) %>%
group_by(token) %>% summarize(count = n())%>%
ungroup()%>%
arrange(desc(count))%>%top_n(15)
## Warning in entity == c("TIME_I", "TIME_B"): longer object length is not a
## multiple of shorter object length
## Selecting by count
time_entities
## # A tibble: 15 × 2
## token count
## <chr> <int>
## 1 minutes 290
## 2 hours 229
## 3 hour 183
## 4 an 97
## 5 2 87
## 6 pm 78
## 7 minute 77
## 8 10 56
## 9 45 51
## 10 1 48
## 11 30 47
## 12 a 45
## 13 am 42
## 14 about 41
## 15 3 40
# Plot a barchart for time words
#install.packages("pals")
library("pals")
ggplot(time_entities, aes(x=reorder(token,-count), y=count)) +
geom_bar(stat = "identity", aes(fill = as.factor(token))) +
labs(title = "Time Words in Negative Disneyland Reviews",
x = "Time words", y = "Count") +
theme_economist()+
scale_fill_manual(values=as.vector(brewer.set2(20)))
The time plot above shows that hours and minutes are mentioned frequently in negative reviews, so we speculate that many visitors’ waiting time are quite long. However, these are the unit words, we need to construct predictive models to see how time issues related to low ratings.